From 12e08417d625bbacde478cfaaa49fc053ff97ffd Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sun, 10 Feb 2019 18:42:56 +0300 Subject: [PATCH 01/43] Template format implementation --- dbms/src/Common/ErrorCodes.cpp | 1 + .../BlockOutputStreamFromRowOutputStream.h | 2 +- dbms/src/Formats/FormatFactory.cpp | 2 + dbms/src/Formats/TemplateRowOutputStream.cpp | 180 ++++++++++++++++++ dbms/src/Formats/TemplateRowOutputStream.h | 52 +++++ 5 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 dbms/src/Formats/TemplateRowOutputStream.cpp create mode 100644 dbms/src/Formats/TemplateRowOutputStream.h diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index a3b788c230f..158f5cbcf65 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -434,6 +434,7 @@ namespace ErrorCodes extern const int BAD_QUERY_PARAMETER = 457; extern const int CANNOT_UNLINK = 458; extern const int CANNOT_SET_THREAD_PRIORITY = 459; + extern const int INVALID_TEMPLATE_FORMAT = 460; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Formats/BlockOutputStreamFromRowOutputStream.h b/dbms/src/Formats/BlockOutputStreamFromRowOutputStream.h index ada924bb5b4..5cf20955bb8 100644 --- a/dbms/src/Formats/BlockOutputStreamFromRowOutputStream.h +++ b/dbms/src/Formats/BlockOutputStreamFromRowOutputStream.h @@ -29,7 +29,7 @@ public: String getContentType() const override { return row_output->getContentType(); } -private: +protected: RowOutputStreamPtr row_output; Block header; bool first_row = true; diff --git a/dbms/src/Formats/FormatFactory.cpp b/dbms/src/Formats/FormatFactory.cpp index 4fae140abee..d1364892811 100644 --- a/dbms/src/Formats/FormatFactory.cpp +++ b/dbms/src/Formats/FormatFactory.cpp @@ -197,6 +197,7 @@ void registerInputFormatParquet(FormatFactory & factory); void registerOutputFormatParquet(FormatFactory & factory); void registerInputFormatProtobuf(FormatFactory & factory); 
void registerOutputFormatProtobuf(FormatFactory & factory); +void registerOutputFormatTemplate(FormatFactory &factory); void registerInputFormatProcessorNative(FormatFactory & factory); void registerOutputFormatProcessorNative(FormatFactory & factory); @@ -270,6 +271,7 @@ FormatFactory::FormatFactory() registerInputFormatCapnProto(*this); registerInputFormatParquet(*this); registerOutputFormatParquet(*this); + registerOutputFormatTemplate(*this); registerOutputFormatMySQLWire(*this); diff --git a/dbms/src/Formats/TemplateRowOutputStream.cpp b/dbms/src/Formats/TemplateRowOutputStream.cpp new file mode 100644 index 00000000000..b2bf730d6f4 --- /dev/null +++ b/dbms/src/Formats/TemplateRowOutputStream.cpp @@ -0,0 +1,180 @@ +#include +#include +#include +#include + + +namespace DB { + + +namespace ErrorCodes { + extern const int INVALID_TEMPLATE_FORMAT; +} + +TemplateRowOutputStream::TemplateRowOutputStream(WriteBuffer &ostr_, const Block &sample, + const FormatSettings &settings_, const String& format_template) + : ostr(ostr_), settings(settings_) +{ + parseFormatString(format_template, sample); +} + + +void TemplateRowOutputStream::parseFormatString(const String & s, const Block & sample) +{ + enum ParserState + { + Delimiter, + Column, + Format + }; + const char * pos = s.c_str(); + const char * token_begin = pos; + ParserState state = Delimiter; + delimiters.emplace_back(); + for (; *pos; ++pos) + { + switch (state) + { + case Delimiter: + if (*pos == '$') + { + delimiters.back().append(token_begin, pos - token_begin); + ++pos; + if (*pos == '{') + { + token_begin = pos + 1; + state = Column; + } + else if (*pos == '$') + { + token_begin = pos; + } + else + { + throw Exception("invalid template: pos " + std::to_string(pos - s.c_str()) + + ": expected '{' or '$' after '$'", ErrorCodes::INVALID_TEMPLATE_FORMAT); + } + } + break; + + case Column: + if (*pos == ':') + { + size_t column_idx = sample.getPositionByName(String(token_begin, pos - token_begin)); + 
format_idx_to_column_idx.push_back(column_idx); + token_begin = pos + 1; + state = Format; + } + else if (*pos == '}') + { + size_t column_idx = sample.getPositionByName(String(token_begin, pos - token_begin)); + format_idx_to_column_idx.push_back(column_idx); + formats.push_back(ColumnFormat::Default); + delimiters.emplace_back(); + token_begin = pos + 1; + state = Delimiter; + } + break; + + case Format: + if (*pos == '}') + { + formats.push_back(stringToFormat(String(token_begin, pos - token_begin))); + token_begin = pos + 1; + delimiters.emplace_back(); + state = Delimiter; + } + } + } + if (state != Delimiter) + throw Exception("invalid template: check parentheses balance", ErrorCodes::INVALID_TEMPLATE_FORMAT); + if (delimiters.size() == 1) + throw Exception("invalid template: no columns specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); + delimiters.back().append(token_begin, pos - token_begin); +} + + +TemplateRowOutputStream::ColumnFormat TemplateRowOutputStream::stringToFormat(const String & format) +{ + if (format.empty()) + return ColumnFormat::Default; + else if (format == "Escaped") + return ColumnFormat::Escaped; + else if (format == "Quoted") + return ColumnFormat::Quoted; + else if (format == "JSON") + return ColumnFormat::Json; + else if (format == "XML") + return ColumnFormat::Xml; + else if (format == "Raw") + return ColumnFormat::Raw; + else + throw Exception("invalid template: unknown field format " + format, ErrorCodes::INVALID_TEMPLATE_FORMAT); + +} + +void TemplateRowOutputStream::flush() +{ + ostr.next(); +} + +void TemplateRowOutputStream::serializeField(const ColumnWithTypeAndName & col, size_t row_num, ColumnFormat format) +{ + switch (format) + { + case ColumnFormat::Default: + case ColumnFormat::Escaped: + col.type->serializeAsTextEscaped(*col.column, row_num, ostr, settings); + break; + case ColumnFormat::Quoted: + col.type->serializeAsTextQuoted(*col.column, row_num, ostr, settings); + break; + case ColumnFormat::Json: + 
col.type->serializeAsTextJSON(*col.column, row_num, ostr, settings); + break; + case ColumnFormat::Xml: + col.type->serializeAsTextXML(*col.column, row_num, ostr, settings); + break; + case ColumnFormat::Raw: + col.type->serializeAsText(*col.column, row_num, ostr, settings); + break; + default: + __builtin_unreachable(); + } +} + +void TemplateRowOutputStream::write(const Block & block, size_t row_num) +{ + size_t columns = format_idx_to_column_idx.size(); + for (size_t i = 0; i < columns; ++i) + { + writeString(delimiters[i], ostr); + + size_t col_idx = format_idx_to_column_idx[i]; + const ColumnWithTypeAndName & col = block.getByPosition(col_idx); + serializeField(col, row_num, formats[i]); + } + writeString(delimiters[columns], ostr); +} + +void TemplateBlockOutputStream::write(const Block & block) +{ + size_t rows = block.rows(); + for (size_t i = 0; i < rows; ++i) + row_output->write(block, i); + +} + +void registerOutputFormatTemplate(FormatFactory &factory) +{ + factory.registerOutputFormat("Template", []( + WriteBuffer &buf, + const Block &sample, + const Context & context, + const FormatSettings &settings) { + auto format_template = context.getSettingsRef().format_schema.toString(); + return std::make_shared( + std::make_shared(buf, sample, settings, format_template), sample); + }); +} +} diff --git a/dbms/src/Formats/TemplateRowOutputStream.h b/dbms/src/Formats/TemplateRowOutputStream.h new file mode 100644 index 00000000000..ab7b60ee2ab --- /dev/null +++ b/dbms/src/Formats/TemplateRowOutputStream.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +class TemplateRowOutputStream : public IRowOutputStream +{ +public: + enum class ColumnFormat + { + Default, + Escaped, + Quoted, + Json, + Xml, + Raw + }; + + TemplateRowOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_, const String & format_template); + + void write(const Block & block, size_t row_num) override; + void 
writeField(const IColumn &, const IDataType &, size_t) override {}; + void flush() override; + +private: + ColumnFormat stringToFormat(const String & format); + void parseFormatString(const String & s, const Block & sample); + void serializeField(const ColumnWithTypeAndName & col, size_t row_num, ColumnFormat format); + +private: + WriteBuffer & ostr; + const FormatSettings settings; + std::vector delimiters; + std::vector formats; + std::vector format_idx_to_column_idx; +}; + +class TemplateBlockOutputStream : public BlockOutputStreamFromRowOutputStream +{ +public: + TemplateBlockOutputStream(RowOutputStreamPtr row_output_, const Block & header_) + : BlockOutputStreamFromRowOutputStream(row_output_, header_) {}; + void write(const Block & block) override; +}; + +} From 4f7720139a5daf1468934ded5c677ae449197fe7 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 12 Feb 2019 22:40:03 +0300 Subject: [PATCH 02/43] TemplateRowOutputStream changed to TemplateBlockOutputStream --- .../BlockOutputStreamFromRowOutputStream.h | 2 +- ...ream.cpp => TemplateBlockOutputStream.cpp} | 73 +++++++++++-------- dbms/src/Formats/TemplateBlockOutputStream.h | 58 +++++++++++++++ dbms/src/Formats/TemplateRowOutputStream.h | 52 ------------- 4 files changed, 100 insertions(+), 85 deletions(-) rename dbms/src/Formats/{TemplateRowOutputStream.cpp => TemplateBlockOutputStream.cpp} (76%) create mode 100644 dbms/src/Formats/TemplateBlockOutputStream.h delete mode 100644 dbms/src/Formats/TemplateRowOutputStream.h diff --git a/dbms/src/Formats/BlockOutputStreamFromRowOutputStream.h b/dbms/src/Formats/BlockOutputStreamFromRowOutputStream.h index 5cf20955bb8..ada924bb5b4 100644 --- a/dbms/src/Formats/BlockOutputStreamFromRowOutputStream.h +++ b/dbms/src/Formats/BlockOutputStreamFromRowOutputStream.h @@ -29,7 +29,7 @@ public: String getContentType() const override { return row_output->getContentType(); } -protected: +private: RowOutputStreamPtr row_output; Block header; bool first_row = 
true; diff --git a/dbms/src/Formats/TemplateRowOutputStream.cpp b/dbms/src/Formats/TemplateBlockOutputStream.cpp similarity index 76% rename from dbms/src/Formats/TemplateRowOutputStream.cpp rename to dbms/src/Formats/TemplateBlockOutputStream.cpp index b2bf730d6f4..e8d67804a13 100644 --- a/dbms/src/Formats/TemplateRowOutputStream.cpp +++ b/dbms/src/Formats/TemplateBlockOutputStream.cpp @@ -1,25 +1,26 @@ -#include +#include #include #include #include -namespace DB { +namespace DB +{ - -namespace ErrorCodes { +namespace ErrorCodes +{ extern const int INVALID_TEMPLATE_FORMAT; } -TemplateRowOutputStream::TemplateRowOutputStream(WriteBuffer &ostr_, const Block &sample, +TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer &ostr_, const Block &sample, const FormatSettings &settings_, const String& format_template) - : ostr(ostr_), settings(settings_) + : ostr(ostr_), header(sample), settings(settings_) { - parseFormatString(format_template, sample); + parseFormatString(format_template); } -void TemplateRowOutputStream::parseFormatString(const String & s, const Block & sample) +void TemplateBlockOutputStream::parseFormatString(const String & s) { enum ParserState { @@ -60,14 +61,14 @@ void TemplateRowOutputStream::parseFormatString(const String & s, const Block & case Column: if (*pos == ':') { - size_t column_idx = sample.getPositionByName(String(token_begin, pos - token_begin)); + size_t column_idx = header.getPositionByName(String(token_begin, pos - token_begin)); format_idx_to_column_idx.push_back(column_idx); token_begin = pos + 1; state = Format; } else if (*pos == '}') { - size_t column_idx = sample.getPositionByName(String(token_begin, pos - token_begin)); + size_t column_idx = header.getPositionByName(String(token_begin, pos - token_begin)); format_idx_to_column_idx.push_back(column_idx); formats.push_back(ColumnFormat::Default); delimiters.emplace_back(); @@ -94,7 +95,7 @@ void TemplateRowOutputStream::parseFormatString(const String & s, const Block 
& } -TemplateRowOutputStream::ColumnFormat TemplateRowOutputStream::stringToFormat(const String & format) +TemplateBlockOutputStream::ColumnFormat TemplateBlockOutputStream::stringToFormat(const String & format) { if (format.empty()) return ColumnFormat::Default; @@ -113,12 +114,12 @@ TemplateRowOutputStream::ColumnFormat TemplateRowOutputStream::stringToFormat(co } -void TemplateRowOutputStream::flush() +void TemplateBlockOutputStream::flush() { ostr.next(); } -void TemplateRowOutputStream::serializeField(const ColumnWithTypeAndName & col, size_t row_num, ColumnFormat format) +void TemplateBlockOutputStream::serializeField(const ColumnWithTypeAndName & col, size_t row_num, ColumnFormat format) { switch (format) { @@ -143,38 +144,46 @@ void TemplateRowOutputStream::serializeField(const ColumnWithTypeAndName & col, } } -void TemplateRowOutputStream::write(const Block & block, size_t row_num) -{ - size_t columns = format_idx_to_column_idx.size(); - for (size_t i = 0; i < columns; ++i) - { - writeString(delimiters[i], ostr); - - size_t col_idx = format_idx_to_column_idx[i]; - const ColumnWithTypeAndName & col = block.getByPosition(col_idx); - serializeField(col, row_num, formats[i]); - } - writeString(delimiters[columns], ostr); -} - void TemplateBlockOutputStream::write(const Block & block) { size_t rows = block.rows(); - for (size_t i = 0; i < rows; ++i) - row_output->write(block, i); + size_t columns = format_idx_to_column_idx.size(); + for (size_t i = 0; i < rows; ++i) + { + for (size_t j = 0; j < columns; ++j) + { + writeString(delimiters[j], ostr); + + size_t col_idx = format_idx_to_column_idx[j]; + const ColumnWithTypeAndName & col = block.getByPosition(col_idx); + serializeField(col, i, formats[j]); + } + writeString(delimiters[columns], ostr); + } } +void TemplateBlockOutputStream::writePrefix() +{ + // TODO +} + +void TemplateBlockOutputStream::writeSuffix() +{ + // TODO +} + + void registerOutputFormatTemplate(FormatFactory &factory) { 
factory.registerOutputFormat("Template", []( WriteBuffer &buf, const Block &sample, const Context & context, - const FormatSettings &settings) { + const FormatSettings &settings) + { auto format_template = context.getSettingsRef().format_schema.toString(); - return std::make_shared( - std::make_shared(buf, sample, settings, format_template), sample); + return std::make_shared(buf, sample, settings, format_template); }); } } diff --git a/dbms/src/Formats/TemplateBlockOutputStream.h b/dbms/src/Formats/TemplateBlockOutputStream.h new file mode 100644 index 00000000000..3953d75e2cc --- /dev/null +++ b/dbms/src/Formats/TemplateBlockOutputStream.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +class TemplateBlockOutputStream : public IBlockOutputStream +{ +public: + enum class ColumnFormat + { + Default, + Escaped, + Quoted, + Json, + Xml, + Raw + }; + + TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_, const String & format_template); + Block getHeader() const override { return header; } + + void write(const Block & block) override; + void writePrefix() override; + void writeSuffix() override; + + void flush() override; + + void setRowsBeforeLimit(size_t rows_before_limit_) override { rows_before_limit = rows_before_limit_; } + void setTotals(const Block & totals_) override { totals = totals_; } + void setExtremes(const Block & extremes_) override { extremes = extremes_; } + void onProgress(const Progress & progress_) override { progress.incrementPiecewiseAtomically(progress_); } + +private: + ColumnFormat stringToFormat(const String & format); + void parseFormatString(const String & s); + void serializeField(const ColumnWithTypeAndName & col, size_t row_num, ColumnFormat format); + +private: + WriteBuffer & ostr; + Block header; + const FormatSettings settings; + std::vector delimiters; + std::vector formats; + std::vector format_idx_to_column_idx; + + size_t 
rows_before_limit; + Block totals; + Block extremes; + Progress progress; +}; + +} diff --git a/dbms/src/Formats/TemplateRowOutputStream.h b/dbms/src/Formats/TemplateRowOutputStream.h deleted file mode 100644 index ab7b60ee2ab..00000000000 --- a/dbms/src/Formats/TemplateRowOutputStream.h +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include -#include -#include -#include - - -namespace DB -{ - -class TemplateRowOutputStream : public IRowOutputStream -{ -public: - enum class ColumnFormat - { - Default, - Escaped, - Quoted, - Json, - Xml, - Raw - }; - - TemplateRowOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_, const String & format_template); - - void write(const Block & block, size_t row_num) override; - void writeField(const IColumn &, const IDataType &, size_t) override {}; - void flush() override; - -private: - ColumnFormat stringToFormat(const String & format); - void parseFormatString(const String & s, const Block & sample); - void serializeField(const ColumnWithTypeAndName & col, size_t row_num, ColumnFormat format); - -private: - WriteBuffer & ostr; - const FormatSettings settings; - std::vector delimiters; - std::vector formats; - std::vector format_idx_to_column_idx; -}; - -class TemplateBlockOutputStream : public BlockOutputStreamFromRowOutputStream -{ -public: - TemplateBlockOutputStream(RowOutputStreamPtr row_output_, const Block & header_) - : BlockOutputStreamFromRowOutputStream(row_output_, header_) {}; - void write(const Block & block) override; -}; - -} From 05d6e23373a5ca994bce530cc54e71f5a774e635 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 18 Feb 2019 00:23:44 +0300 Subject: [PATCH 03/43] Customizable prefix, suffix and row delimiter --- dbms/src/Core/Settings.h | 2 + dbms/src/Formats/FormatFactory.cpp | 3 + dbms/src/Formats/FormatSettings.h | 9 + .../src/Formats/TemplateBlockOutputStream.cpp | 221 +++++++++++++----- dbms/src/Formats/TemplateBlockOutputStream.h | 41 +++- 5 files 
changed, 218 insertions(+), 58 deletions(-) diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index 4df07df60f6..870539e2dab 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -212,6 +212,8 @@ struct Settings : public SettingsCollection M(SettingMilliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.") \ M(SettingMilliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from streaming storages.") \ M(SettingString, format_schema, "", "Schema identifier (used by schema-based formats)") \ + M(SettingString, format_schema_rows, "", "Row format string for Template format") \ + M(SettingString, format_schema_rows_between_delimiter, "\n", "Delimiter between rows for Template format") \ M(SettingBool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.") \ M(SettingSeconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.") \ M(SettingSeconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout") \ diff --git a/dbms/src/Formats/FormatFactory.cpp b/dbms/src/Formats/FormatFactory.cpp index d1364892811..4ed5a59a94c 100644 --- a/dbms/src/Formats/FormatFactory.cpp +++ b/dbms/src/Formats/FormatFactory.cpp @@ -65,6 +65,9 @@ static FormatSettings getOutputFormatSetting(const Settings & settings) format_settings.pretty.max_rows = settings.output_format_pretty_max_rows; format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; format_settings.pretty.color = settings.output_format_pretty_color; + format_settings.template_settings.format = settings.format_schema; + format_settings.template_settings.row_format = settings.format_schema_rows; + format_settings.template_settings.row_between_delimiter = settings.format_schema_rows_between_delimiter; format_settings.write_statistics = settings.output_format_write_statistics; 
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size; diff --git a/dbms/src/Formats/FormatSettings.h b/dbms/src/Formats/FormatSettings.h index 0bb71e6e50e..818adea3c96 100644 --- a/dbms/src/Formats/FormatSettings.h +++ b/dbms/src/Formats/FormatSettings.h @@ -48,6 +48,15 @@ struct FormatSettings Values values; + struct Template + { + String format; + String row_format; + String row_between_delimiter; + }; + + Template template_settings; + bool skip_unknown_fields = false; bool with_names_use_header = false; bool write_statistics = true; diff --git a/dbms/src/Formats/TemplateBlockOutputStream.cpp b/dbms/src/Formats/TemplateBlockOutputStream.cpp index e8d67804a13..1950655dce6 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.cpp +++ b/dbms/src/Formats/TemplateBlockOutputStream.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB @@ -12,15 +13,50 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT; } -TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer &ostr_, const Block &sample, - const FormatSettings &settings_, const String& format_template) +TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_) : ostr(ostr_), header(sample), settings(settings_) { - parseFormatString(format_template); + static const String default_format("${result}"); + const String & format_str = settings.template_settings.format.empty() ? 
default_format : settings.template_settings.format; + format = parseFormatString(format_str, [&](const String & partName) + { + return static_cast(stringToOutputPart(partName)); + }); + + size_t resultIdx = format.format_idx_to_column_idx.size() + 1; + for (size_t i = 0; i < format.format_idx_to_column_idx.size(); ++i) + { + switch (static_cast(format.format_idx_to_column_idx[i])) + { + case OutputPart::Result: + resultIdx = i; + BOOST_FALLTHROUGH; + case OutputPart::Totals: + case OutputPart::ExtremesMin: + case OutputPart::ExtremesMax: + if (format.formats[i] != ColumnFormat::Default) + throw Exception("invalid template: wrong serialization type for result, totals, min or max", + ErrorCodes::INVALID_TEMPLATE_FORMAT); + break; + default: + break; + } + } + + if (resultIdx != 0) + throw Exception("invalid template: ${result} must be the first output part", ErrorCodes::INVALID_TEMPLATE_FORMAT); + + row_format = parseFormatString(settings.template_settings.row_format, [&](const String & colName) + { + return header.getPositionByName(colName); + }); + + if (row_format.delimiters.size() == 1) + throw Exception("invalid template: no columns specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); } -void TemplateBlockOutputStream::parseFormatString(const String & s) +TemplateBlockOutputStream::ParsedFormat TemplateBlockOutputStream::parseFormatString(const String & s, const ColumnIdxGetter & idxByName) { enum ParserState { @@ -28,10 +64,11 @@ void TemplateBlockOutputStream::parseFormatString(const String & s) Column, Format }; + ParsedFormat parsed_format; const char * pos = s.c_str(); const char * token_begin = pos; ParserState state = Delimiter; - delimiters.emplace_back(); + parsed_format.delimiters.emplace_back(); for (; *pos; ++pos) { switch (state) @@ -39,7 +76,7 @@ void TemplateBlockOutputStream::parseFormatString(const String & s) case Delimiter: if (*pos == '$') { - delimiters.back().append(token_begin, pos - token_begin); + 
parsed_format.delimiters.back().append(token_begin, pos - token_begin); ++pos; if (*pos == '{') { @@ -61,17 +98,17 @@ void TemplateBlockOutputStream::parseFormatString(const String & s) case Column: if (*pos == ':') { - size_t column_idx = header.getPositionByName(String(token_begin, pos - token_begin)); - format_idx_to_column_idx.push_back(column_idx); + size_t column_idx = idxByName(String(token_begin, pos - token_begin)); + parsed_format.format_idx_to_column_idx.push_back(column_idx); token_begin = pos + 1; state = Format; } else if (*pos == '}') { - size_t column_idx = header.getPositionByName(String(token_begin, pos - token_begin)); - format_idx_to_column_idx.push_back(column_idx); - formats.push_back(ColumnFormat::Default); - delimiters.emplace_back(); + size_t column_idx = idxByName(String(token_begin, pos - token_begin)); + parsed_format.format_idx_to_column_idx.push_back(column_idx); + parsed_format.formats.push_back(ColumnFormat::Default); + parsed_format.delimiters.emplace_back(); token_begin = pos + 1; state = Delimiter; } @@ -80,38 +117,60 @@ void TemplateBlockOutputStream::parseFormatString(const String & s) case Format: if (*pos == '}') { - formats.push_back(stringToFormat(String(token_begin, pos - token_begin))); + parsed_format.formats.push_back(stringToFormat(String(token_begin, pos - token_begin))); token_begin = pos + 1; - delimiters.emplace_back(); + parsed_format.delimiters.emplace_back(); state = Delimiter; } } } if (state != Delimiter) throw Exception("invalid template: check parentheses balance", ErrorCodes::INVALID_TEMPLATE_FORMAT); - if (delimiters.size() == 1) - throw Exception("invalid template: no columns specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); - delimiters.back().append(token_begin, pos - token_begin); + parsed_format.delimiters.back().append(token_begin, pos - token_begin); + return parsed_format; } -TemplateBlockOutputStream::ColumnFormat TemplateBlockOutputStream::stringToFormat(const String & format) 
+TemplateBlockOutputStream::ColumnFormat TemplateBlockOutputStream::stringToFormat(const String & col_format) { - if (format.empty()) + if (col_format.empty()) return ColumnFormat::Default; - else if (format == "Escaped") + else if (col_format == "Escaped") return ColumnFormat::Escaped; - else if (format == "Quoted") + else if (col_format == "Quoted") return ColumnFormat::Quoted; - else if (format == "JSON") + else if (col_format == "JSON") return ColumnFormat::Json; - else if (format == "XML") + else if (col_format == "XML") return ColumnFormat::Xml; - else if (format == "Raw") + else if (col_format == "Raw") return ColumnFormat::Raw; else - throw Exception("invalid template: unknown field format " + format, ErrorCodes::INVALID_TEMPLATE_FORMAT); + throw Exception("invalid template: unknown field format " + col_format, ErrorCodes::INVALID_TEMPLATE_FORMAT); +} +TemplateBlockOutputStream::OutputPart TemplateBlockOutputStream::stringToOutputPart(const String & part) +{ + if (part == "result") + return OutputPart::Result; + else if (part == "totals") + return OutputPart::Totals; + else if (part == "min") + return OutputPart::ExtremesMin; + else if (part == "max") + return OutputPart::ExtremesMax; + else if (part == "rows") + return OutputPart::Rows; + else if (part == "rows_before_limit") + return OutputPart::RowsBeforeLimit; + else if (part == "time") + return OutputPart::TimeElapsed; + else if (part == "rows_read") + return OutputPart::RowsRead; + else if (part == "bytes_read") + return OutputPart::BytesRead; + else + throw Exception("invalid template: unknown output part " + part, ErrorCodes::INVALID_TEMPLATE_FORMAT); } void TemplateBlockOutputStream::flush() @@ -119,71 +178,129 @@ void TemplateBlockOutputStream::flush() ostr.next(); } -void TemplateBlockOutputStream::serializeField(const ColumnWithTypeAndName & col, size_t row_num, ColumnFormat format) +void TemplateBlockOutputStream::writeRow(const Block & block, size_t row_num) { - switch (format) + size_t 
columns = row_format.format_idx_to_column_idx.size(); + for (size_t j = 0; j < columns; ++j) + { + writeString(row_format.delimiters[j], ostr); + + size_t col_idx = row_format.format_idx_to_column_idx[j]; + const ColumnWithTypeAndName & col = block.getByPosition(col_idx); + serializeField(*col.column, *col.type, row_num, row_format.formats[j]); + } + writeString(row_format.delimiters[columns], ostr); +} + +void TemplateBlockOutputStream::serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat col_format) +{ + switch (col_format) { case ColumnFormat::Default: case ColumnFormat::Escaped: - col.type->serializeAsTextEscaped(*col.column, row_num, ostr, settings); + type.serializeAsTextEscaped(column, row_num, ostr, settings); break; case ColumnFormat::Quoted: - col.type->serializeAsTextQuoted(*col.column, row_num, ostr, settings); + type.serializeAsTextQuoted(column, row_num, ostr, settings); break; case ColumnFormat::Json: - col.type->serializeAsTextJSON(*col.column, row_num, ostr, settings); + type.serializeAsTextJSON(column, row_num, ostr, settings); break; case ColumnFormat::Xml: - col.type->serializeAsTextXML(*col.column, row_num, ostr, settings); + type.serializeAsTextXML(column, row_num, ostr, settings); break; case ColumnFormat::Raw: - col.type->serializeAsText(*col.column, row_num, ostr, settings); + type.serializeAsText(column, row_num, ostr, settings); break; - default: - __builtin_unreachable(); } } +template void TemplateBlockOutputStream::writeValue(U value, ColumnFormat col_format) +{ + auto type = std::make_unique(); + auto col = type->createColumn(); + col->insert(value); + serializeField(*col, *type, 0, col_format); +} + void TemplateBlockOutputStream::write(const Block & block) { size_t rows = block.rows(); - size_t columns = format_idx_to_column_idx.size(); for (size_t i = 0; i < rows; ++i) { - for (size_t j = 0; j < columns; ++j) - { - writeString(delimiters[j], ostr); + if (row_count) + 
writeString(settings.template_settings.row_between_delimiter, ostr); - size_t col_idx = format_idx_to_column_idx[j]; - const ColumnWithTypeAndName & col = block.getByPosition(col_idx); - serializeField(col, i, formats[j]); - } - writeString(delimiters[columns], ostr); + writeRow(block, i); + ++row_count; } } void TemplateBlockOutputStream::writePrefix() { - // TODO + writeString(format.delimiters.front(), ostr); } void TemplateBlockOutputStream::writeSuffix() { - // TODO + + size_t parts = format.format_idx_to_column_idx.size(); + + for (size_t j = 0; j < parts; ++j) + { + auto type = std::make_shared(); + ColumnWithTypeAndName col(type->createColumnConst(1, row_count), type, String("tmp")); + switch (static_cast(format.format_idx_to_column_idx[j])) + { + case OutputPart::Totals: + if (!totals) + throw Exception("invalid template: cannot print totals for this request", ErrorCodes::INVALID_TEMPLATE_FORMAT); + writeRow(totals, 0); + break; + case OutputPart::ExtremesMin: + if (!extremes) + throw Exception("invalid template: cannot print extremes for this request", ErrorCodes::INVALID_TEMPLATE_FORMAT); + writeRow(extremes, 0); + break; + case OutputPart::ExtremesMax: + if (!extremes) + throw Exception("invalid template: cannot print extremes for this request", ErrorCodes::INVALID_TEMPLATE_FORMAT); + writeRow(extremes, 1); + break; + case OutputPart::Rows: + writeValue(row_count, format.formats[j]); + break; + case OutputPart::RowsBeforeLimit: + writeValue(rows_before_limit, format.formats[j]); + break; + case OutputPart::TimeElapsed: + writeValue(watch.elapsedSeconds(), format.formats[j]); + break; + case OutputPart::RowsRead: + writeValue(progress.rows.load(), format.formats[j]); + break; + case OutputPart::BytesRead: + writeValue(progress.bytes.load(), format.formats[j]); + break; + default: + break; + } + writeString(format.delimiters[j + 1], ostr); + } + } -void registerOutputFormatTemplate(FormatFactory &factory) +void registerOutputFormatTemplate(FormatFactory & 
factory) { factory.registerOutputFormat("Template", []( - WriteBuffer &buf, - const Block &sample, - const Context & context, - const FormatSettings &settings) + WriteBuffer & buf, + const Block & sample, + const Context &, + const FormatSettings & settings) { - auto format_template = context.getSettingsRef().format_schema.toString(); - return std::make_shared(buf, sample, settings, format_template); + return std::make_shared(buf, sample, settings); }); } } diff --git a/dbms/src/Formats/TemplateBlockOutputStream.h b/dbms/src/Formats/TemplateBlockOutputStream.h index 3953d75e2cc..e4a5f8a2a2a 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.h +++ b/dbms/src/Formats/TemplateBlockOutputStream.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -22,7 +23,7 @@ public: Raw }; - TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_, const String & format_template); + TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_); Block getHeader() const override { return header; } void write(const Block & block) override; @@ -37,22 +38,50 @@ public: void onProgress(const Progress & progress_) override { progress.incrementPiecewiseAtomically(progress_); } private: + enum class OutputPart : size_t + { + Result, + Totals, + ExtremesMin, + ExtremesMax, + Rows, + RowsBeforeLimit, + TimeElapsed, + RowsRead, + BytesRead + }; + + struct ParsedFormat + { + std::vector delimiters; + std::vector formats; + std::vector format_idx_to_column_idx; + }; + + typedef std::function ColumnIdxGetter; + ColumnFormat stringToFormat(const String & format); - void parseFormatString(const String & s); - void serializeField(const ColumnWithTypeAndName & col, size_t row_num, ColumnFormat format); + OutputPart stringToOutputPart(const String & part); + ParsedFormat parseFormatString(const String & s, const ColumnIdxGetter & idxByName); + void writeRow(const Block & block, size_t row_num); 
+ void serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat format); + template void writeValue(U value, ColumnFormat col_format); private: WriteBuffer & ostr; Block header; const FormatSettings settings; - std::vector delimiters; - std::vector formats; - std::vector format_idx_to_column_idx; + + ParsedFormat format; + ParsedFormat row_format; size_t rows_before_limit; Block totals; Block extremes; Progress progress; + Stopwatch watch; + + size_t row_count = 0; }; } From 79015898cf1a046cfe64894a4496465c540a511b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 8 Apr 2019 00:30:54 +0300 Subject: [PATCH 04/43] TemplateRowInputStream --- dbms/src/Formats/FormatFactory.cpp | 5 + .../src/Formats/TemplateBlockOutputStream.cpp | 198 +++++++++--------- dbms/src/Formats/TemplateBlockOutputStream.h | 33 +-- dbms/src/Formats/TemplateRowInputStream.cpp | 126 +++++++++++ dbms/src/Formats/TemplateRowInputStream.h | 45 ++++ 5 files changed, 295 insertions(+), 112 deletions(-) create mode 100644 dbms/src/Formats/TemplateRowInputStream.cpp create mode 100644 dbms/src/Formats/TemplateRowInputStream.h diff --git a/dbms/src/Formats/FormatFactory.cpp b/dbms/src/Formats/FormatFactory.cpp index 4ed5a59a94c..fa78089b259 100644 --- a/dbms/src/Formats/FormatFactory.cpp +++ b/dbms/src/Formats/FormatFactory.cpp @@ -49,6 +49,9 @@ static FormatSettings getInputFormatSetting(const Settings & settings) format_settings.date_time_input_format = settings.date_time_input_format; format_settings.input_allow_errors_num = settings.input_format_allow_errors_num; format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio; + format_settings.template_settings.format = settings.format_schema; + format_settings.template_settings.row_format = settings.format_schema_rows; + format_settings.template_settings.row_between_delimiter = settings.format_schema_rows_between_delimiter; return format_settings; } @@ -200,6 +203,7 @@ void 
registerInputFormatParquet(FormatFactory & factory); void registerOutputFormatParquet(FormatFactory & factory); void registerInputFormatProtobuf(FormatFactory & factory); void registerOutputFormatProtobuf(FormatFactory & factory); +void registerInputFormatTemplate(FormatFactory & factory); void registerOutputFormatTemplate(FormatFactory &factory); void registerInputFormatProcessorNative(FormatFactory & factory); @@ -274,6 +278,7 @@ FormatFactory::FormatFactory() registerInputFormatCapnProto(*this); registerInputFormatParquet(*this); registerOutputFormatParquet(*this); + registerInputFormatTemplate(*this); registerOutputFormatTemplate(*this); registerOutputFormatMySQLWire(*this); diff --git a/dbms/src/Formats/TemplateBlockOutputStream.cpp b/dbms/src/Formats/TemplateBlockOutputStream.cpp index 1950655dce6..c43b880e349 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.cpp +++ b/dbms/src/Formats/TemplateBlockOutputStream.cpp @@ -13,12 +13,110 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT; } +ParsedTemplateFormat::ParsedTemplateFormat(const String & format_string, const ColumnIdxGetter & idxByName) +{ + enum ParserState + { + Delimiter, + Column, + Format + }; + const char * pos = format_string.c_str(); + const char * token_begin = pos; + ParserState state = Delimiter; + delimiters.emplace_back(); + for (; *pos; ++pos) + { + switch (state) + { + case Delimiter: + if (*pos == '$') + { + delimiters.back().append(token_begin, pos - token_begin); + ++pos; + if (*pos == '{') + { + token_begin = pos + 1; + state = Column; + } + else if (*pos == '$') + { + token_begin = pos; + } + else + { + throw Exception("invalid template: pos " + std::to_string(pos - format_string.c_str()) + + ": expected '{' or '$' after '$'", ErrorCodes::INVALID_TEMPLATE_FORMAT); + } + } + break; + + case Column: + if (*pos == ':') + { + size_t column_idx = idxByName(String(token_begin, pos - token_begin)); + format_idx_to_column_idx.push_back(column_idx); + token_begin = pos + 
1; + state = Format; + } + else if (*pos == '}') + { + size_t column_idx = idxByName(String(token_begin, pos - token_begin)); + format_idx_to_column_idx.push_back(column_idx); + formats.push_back(ColumnFormat::Default); + delimiters.emplace_back(); + token_begin = pos + 1; + state = Delimiter; + } + break; + + case Format: + if (*pos == '}') + { + formats.push_back(stringToFormat(String(token_begin, pos - token_begin))); + token_begin = pos + 1; + delimiters.emplace_back(); + state = Delimiter; + } + } + } + if (state != Delimiter) + throw Exception("invalid template: check parentheses balance", ErrorCodes::INVALID_TEMPLATE_FORMAT); + delimiters.back().append(token_begin, pos - token_begin); +} + + +ParsedTemplateFormat::ColumnFormat ParsedTemplateFormat::stringToFormat(const String & col_format) +{ + if (col_format.empty()) + return ColumnFormat::Default; + else if (col_format == "Escaped") + return ColumnFormat::Escaped; + else if (col_format == "Quoted") + return ColumnFormat::Quoted; + else if (col_format == "JSON") + return ColumnFormat::Json; + else if (col_format == "XML") + return ColumnFormat::Xml; + else if (col_format == "Raw") + return ColumnFormat::Raw; + else + throw Exception("invalid template: unknown field format " + col_format, ErrorCodes::INVALID_TEMPLATE_FORMAT); +} + +size_t ParsedTemplateFormat::columnsCount() const +{ + return format_idx_to_column_idx.size(); +} + + + TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_) : ostr(ostr_), header(sample), settings(settings_) { static const String default_format("${result}"); const String & format_str = settings.template_settings.format.empty() ? 
default_format : settings.template_settings.format; - format = parseFormatString(format_str, [&](const String & partName) + format = ParsedTemplateFormat(format_str, [&](const String & partName) { return static_cast(stringToOutputPart(partName)); }); @@ -36,7 +134,7 @@ TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const case OutputPart::ExtremesMax: if (format.formats[i] != ColumnFormat::Default) throw Exception("invalid template: wrong serialization type for result, totals, min or max", - ErrorCodes::INVALID_TEMPLATE_FORMAT); + ErrorCodes::INVALID_TEMPLATE_FORMAT); break; default: break; @@ -46,7 +144,7 @@ TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const if (resultIdx != 0) throw Exception("invalid template: ${result} must be the first output part", ErrorCodes::INVALID_TEMPLATE_FORMAT); - row_format = parseFormatString(settings.template_settings.row_format, [&](const String & colName) + row_format = ParsedTemplateFormat(settings.template_settings.row_format, [&](const String & colName) { return header.getPositionByName(colName); }); @@ -55,100 +153,6 @@ TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const throw Exception("invalid template: no columns specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); } - -TemplateBlockOutputStream::ParsedFormat TemplateBlockOutputStream::parseFormatString(const String & s, const ColumnIdxGetter & idxByName) -{ - enum ParserState - { - Delimiter, - Column, - Format - }; - ParsedFormat parsed_format; - const char * pos = s.c_str(); - const char * token_begin = pos; - ParserState state = Delimiter; - parsed_format.delimiters.emplace_back(); - for (; *pos; ++pos) - { - switch (state) - { - case Delimiter: - if (*pos == '$') - { - parsed_format.delimiters.back().append(token_begin, pos - token_begin); - ++pos; - if (*pos == '{') - { - token_begin = pos + 1; - state = Column; - } - else if (*pos == '$') - { - token_begin = pos; - } - else - { - throw 
Exception("invalid template: pos " + std::to_string(pos - s.c_str()) + - ": expected '{' or '$' after '$'", ErrorCodes::INVALID_TEMPLATE_FORMAT); - } - } - break; - - case Column: - if (*pos == ':') - { - size_t column_idx = idxByName(String(token_begin, pos - token_begin)); - parsed_format.format_idx_to_column_idx.push_back(column_idx); - token_begin = pos + 1; - state = Format; - } - else if (*pos == '}') - { - size_t column_idx = idxByName(String(token_begin, pos - token_begin)); - parsed_format.format_idx_to_column_idx.push_back(column_idx); - parsed_format.formats.push_back(ColumnFormat::Default); - parsed_format.delimiters.emplace_back(); - token_begin = pos + 1; - state = Delimiter; - } - break; - - case Format: - if (*pos == '}') - { - parsed_format.formats.push_back(stringToFormat(String(token_begin, pos - token_begin))); - token_begin = pos + 1; - parsed_format.delimiters.emplace_back(); - state = Delimiter; - } - } - } - if (state != Delimiter) - throw Exception("invalid template: check parentheses balance", ErrorCodes::INVALID_TEMPLATE_FORMAT); - parsed_format.delimiters.back().append(token_begin, pos - token_begin); - return parsed_format; -} - - -TemplateBlockOutputStream::ColumnFormat TemplateBlockOutputStream::stringToFormat(const String & col_format) -{ - if (col_format.empty()) - return ColumnFormat::Default; - else if (col_format == "Escaped") - return ColumnFormat::Escaped; - else if (col_format == "Quoted") - return ColumnFormat::Quoted; - else if (col_format == "JSON") - return ColumnFormat::Json; - else if (col_format == "XML") - return ColumnFormat::Xml; - else if (col_format == "Raw") - return ColumnFormat::Raw; - else - throw Exception("invalid template: unknown field format " + col_format, ErrorCodes::INVALID_TEMPLATE_FORMAT); -} - TemplateBlockOutputStream::OutputPart TemplateBlockOutputStream::stringToOutputPart(const String & part) { if (part == "result") diff --git a/dbms/src/Formats/TemplateBlockOutputStream.h 
b/dbms/src/Formats/TemplateBlockOutputStream.h index e4a5f8a2a2a..29b79979ff1 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.h +++ b/dbms/src/Formats/TemplateBlockOutputStream.h @@ -10,9 +10,8 @@ namespace DB { -class TemplateBlockOutputStream : public IBlockOutputStream +struct ParsedTemplateFormat { -public: enum class ColumnFormat { Default, @@ -22,7 +21,22 @@ public: Xml, Raw }; + std::vector delimiters; + std::vector formats; + std::vector format_idx_to_column_idx; + typedef std::function ColumnIdxGetter; + + ParsedTemplateFormat() = default; + ParsedTemplateFormat(const String & format_string, const ColumnIdxGetter & idxByName); + static ColumnFormat stringToFormat(const String & format); + size_t columnsCount() const; +}; + +class TemplateBlockOutputStream : public IBlockOutputStream +{ + using ColumnFormat = ParsedTemplateFormat::ColumnFormat; +public: TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_); Block getHeader() const override { return header; } @@ -51,18 +65,7 @@ private: BytesRead }; - struct ParsedFormat - { - std::vector delimiters; - std::vector formats; - std::vector format_idx_to_column_idx; - }; - - typedef std::function ColumnIdxGetter; - - ColumnFormat stringToFormat(const String & format); OutputPart stringToOutputPart(const String & part); - ParsedFormat parseFormatString(const String & s, const ColumnIdxGetter & idxByName); void writeRow(const Block & block, size_t row_num); void serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat format); template void writeValue(U value, ColumnFormat col_format); @@ -72,8 +75,8 @@ private: Block header; const FormatSettings settings; - ParsedFormat format; - ParsedFormat row_format; + ParsedTemplateFormat format; + ParsedTemplateFormat row_format; size_t rows_before_limit; Block totals; diff --git a/dbms/src/Formats/TemplateRowInputStream.cpp b/dbms/src/Formats/TemplateRowInputStream.cpp new file mode 
100644 index 00000000000..8e178cfb92b --- /dev/null +++ b/dbms/src/Formats/TemplateRowInputStream.cpp @@ -0,0 +1,126 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int INVALID_TEMPLATE_FORMAT; +} + + +TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & settings_, bool ignore_spaces_) + : istr(istr_), header(header_), types(header.getDataTypes()), settings(settings_), ignore_spaces(ignore_spaces_) +{ + static const String default_format("${data}"); + const String & format_str = settings.template_settings.format.empty() ? default_format : settings.template_settings.format; + format = ParsedTemplateFormat(format_str, [&](const String & partName) { + if (partName == "data") + return 0; + throw Exception("invalid template format: unknown input part " + partName, ErrorCodes::INVALID_TEMPLATE_FORMAT); + }); + + if (format.formats.size() != 1 || format.formats[0] != ColumnFormat::Default) + throw Exception("invalid template format: format_schema must be \"prefix ${data} suffix\"", ErrorCodes::INVALID_TEMPLATE_FORMAT); + + + row_format = ParsedTemplateFormat(settings.template_settings.row_format, [&](const String & colName) { + return header.getPositionByName(colName); + }); + + std::vector column_in_format(header.columns(), false); + for (size_t i = 0; i < row_format.columnsCount(); ++i) + { + size_t col_idx = row_format.format_idx_to_column_idx[i]; + if (column_in_format[col_idx]) + throw Exception("invalid template format: duplicate column " + header.getColumnsWithTypeAndName()[col_idx].name, + ErrorCodes::INVALID_TEMPLATE_FORMAT); + column_in_format[col_idx] = true; + + if (row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw) + throw Exception("invalid template format: XML and Raw deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); + } +} + +void TemplateRowInputStream::readPrefix() +{ + skipSpaces(); + 
assertString(format.delimiters.front(), istr); +} + +bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & extra) +{ + skipSpaces(); + + // TODO check for suffix, not for EOF + if (istr.eof()) + return false; + + if (row_count) + { + assertString(settings.template_settings.row_between_delimiter, istr); + } + + extra.read_columns.assign(columns.size(), false); + + for (size_t i = 0; i < row_format.columnsCount(); ++i) + { + skipSpaces(); + assertString(row_format.delimiters[i], istr); + size_t col_idx = row_format.format_idx_to_column_idx[i]; + skipSpaces(); + deserializeField(*types[col_idx], *columns[col_idx], row_format.formats[i]); + extra.read_columns[col_idx] = true; + } + + skipSpaces(); + assertString(row_format.delimiters.back(), istr); + + for (size_t i = 0; i < columns.size(); ++i) + if (!extra.read_columns[i]) + header.getByPosition(i).type->insertDefaultInto(*columns[i]); + + ++row_count; + return true; +} + +void TemplateRowInputStream::deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format) +{ + switch (col_format) + { + case ColumnFormat::Default: + case ColumnFormat::Escaped: + type.deserializeAsTextEscaped(column, istr, settings); + break; + case ColumnFormat::Quoted: + type.deserializeAsTextQuoted(column, istr, settings); + break; + case ColumnFormat::Json: + type.deserializeAsTextJSON(column, istr, settings); + break; + default: + break; + } +} + + +void registerInputFormatTemplate(FormatFactory & factory) +{ + for (bool ignore_spaces : {false, true}) + { + factory.registerInputFormat(ignore_spaces ? 
"TemplateIgnoreSpaces" : "Template", [=]( + ReadBuffer & buf, + const Block & sample, + const Context &, + UInt64 max_block_size, + const FormatSettings & settings) { + return std::make_shared( + std::make_shared(buf, sample, settings, ignore_spaces), + sample, max_block_size, settings); + }); + } +} + +} diff --git a/dbms/src/Formats/TemplateRowInputStream.h b/dbms/src/Formats/TemplateRowInputStream.h new file mode 100644 index 00000000000..bab67d29d7b --- /dev/null +++ b/dbms/src/Formats/TemplateRowInputStream.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include +#include +#include + + +namespace DB +{ + +class TemplateRowInputStream : public IRowInputStream +{ + using ColumnFormat = ParsedTemplateFormat::ColumnFormat; +public: + TemplateRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & settings_, bool ignore_spaces_); + + bool read(MutableColumns & columns, RowReadExtension & extra) override; + + void readPrefix() override; + + // TODO + //bool allowSyncAfterError() const override; + //void syncAfterError() override; + //String getDiagnosticInfo() override; + +private: + void deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format); + inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(istr); } + +private: + ReadBuffer & istr; + Block header; + DataTypes types; + + FormatSettings settings; + ParsedTemplateFormat format; + ParsedTemplateFormat row_format; + const bool ignore_spaces; + + size_t row_count = 0; +}; + +} From 892b67492d5ee1cbbdfbbd6e26ccf13c262c8036 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 12 Apr 2019 03:45:18 +0300 Subject: [PATCH 05/43] Checking for suffix, not for eof --- dbms/src/Formats/TemplateRowInputStream.cpp | 73 ++++++++++++--- dbms/src/Formats/TemplateRowInputStream.h | 7 +- dbms/src/IO/PeekableReadBuffer.h | 99 +++++++++++++++++++++ 3 files changed, 163 insertions(+), 16 deletions(-) create mode 100644 dbms/src/IO/PeekableReadBuffer.h 
diff --git a/dbms/src/Formats/TemplateRowInputStream.cpp b/dbms/src/Formats/TemplateRowInputStream.cpp index 8e178cfb92b..a85a79620b0 100644 --- a/dbms/src/Formats/TemplateRowInputStream.cpp +++ b/dbms/src/Formats/TemplateRowInputStream.cpp @@ -11,12 +11,14 @@ extern const int INVALID_TEMPLATE_FORMAT; } -TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & settings_, bool ignore_spaces_) - : istr(istr_), header(header_), types(header.getDataTypes()), settings(settings_), ignore_spaces(ignore_spaces_) +TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & settings_, + bool ignore_spaces_) + : buf(istr_), header(header_), types(header.getDataTypes()), settings(settings_), ignore_spaces(ignore_spaces_) { static const String default_format("${data}"); const String & format_str = settings.template_settings.format.empty() ? default_format : settings.template_settings.format; - format = ParsedTemplateFormat(format_str, [&](const String & partName) { + format = ParsedTemplateFormat(format_str, [&](const String & partName) + { if (partName == "data") return 0; throw Exception("invalid template format: unknown input part " + partName, ErrorCodes::INVALID_TEMPLATE_FORMAT); @@ -26,7 +28,8 @@ TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & throw Exception("invalid template format: format_schema must be \"prefix ${data} suffix\"", ErrorCodes::INVALID_TEMPLATE_FORMAT); - row_format = ParsedTemplateFormat(settings.template_settings.row_format, [&](const String & colName) { + row_format = ParsedTemplateFormat(settings.template_settings.row_format, [&](const String & colName) + { return header.getPositionByName(colName); }); @@ -47,20 +50,19 @@ TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & void TemplateRowInputStream::readPrefix() { skipSpaces(); - assertString(format.delimiters.front(), istr); + 
assertString(format.delimiters.front(), buf); } bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & extra) { skipSpaces(); - // TODO check for suffix, not for EOF - if (istr.eof()) + if (checkForSuffix()) return false; if (row_count) { - assertString(settings.template_settings.row_between_delimiter, istr); + assertString(settings.template_settings.row_between_delimiter, buf); } extra.read_columns.assign(columns.size(), false); @@ -68,7 +70,7 @@ bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & e for (size_t i = 0; i < row_format.columnsCount(); ++i) { skipSpaces(); - assertString(row_format.delimiters[i], istr); + assertString(row_format.delimiters[i], buf); size_t col_idx = row_format.format_idx_to_column_idx[i]; skipSpaces(); deserializeField(*types[col_idx], *columns[col_idx], row_format.formats[i]); @@ -76,7 +78,7 @@ bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & e } skipSpaces(); - assertString(row_format.delimiters.back(), istr); + assertString(row_format.delimiters.back(), buf); for (size_t i = 0; i < columns.size(); ++i) if (!extra.read_columns[i]) @@ -92,19 +94,61 @@ void TemplateRowInputStream::deserializeField(const IDataType & type, IColumn & { case ColumnFormat::Default: case ColumnFormat::Escaped: - type.deserializeAsTextEscaped(column, istr, settings); + type.deserializeAsTextEscaped(column, buf, settings); break; case ColumnFormat::Quoted: - type.deserializeAsTextQuoted(column, istr, settings); + type.deserializeAsTextQuoted(column, buf, settings); break; case ColumnFormat::Json: - type.deserializeAsTextJSON(column, istr, settings); + type.deserializeAsTextJSON(column, buf, settings); break; default: break; } } +/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignnore_spaces == true) before EOF. 
+/// Otherwise returns false +bool TemplateRowInputStream::checkForSuffix() +{ + StringRef suffix(format.delimiters.back()); + if (likely(!compareSuffixPart(suffix, buf.position(), buf.available()))) + return false; + + while (buf.peekNext()) + { + BufferBase::Buffer peeked = buf.lastPeeked(); + if (likely(!compareSuffixPart(suffix, peeked.begin(), peeked.size()))) + return false; + } + return suffix.size == 0; +} + +/// Returns true if buffer contains only suffix and maybe some spaces after it +/// If there are not enough data in buffer, compares available data and removes it from reference to suffix +bool TemplateRowInputStream::compareSuffixPart(StringRef & suffix, BufferBase::Position pos, size_t available) +{ + if (suffix.size < available) + { + if (!ignore_spaces) + return false; + if (likely(suffix != StringRef(pos, suffix.size))) + return false; + suffix.size = 0; + pos += suffix.size; + BufferBase::Position end = pos + available; + while (pos != end) + if (!isWhitespaceASCII(*pos)) + return false; + } + + if (likely(StringRef(suffix.data, available) != StringRef(pos, available))) + return false; + suffix.data += available; + suffix.size -= available; + return true; +} + void registerInputFormatTemplate(FormatFactory & factory) { @@ -115,7 +159,8 @@ void registerInputFormatTemplate(FormatFactory & factory) const Block & sample, const Context &, UInt64 max_block_size, - const FormatSettings & settings) { + const FormatSettings & settings) + { return std::make_shared( std::make_shared(buf, sample, settings, ignore_spaces), sample, max_block_size, settings); diff --git a/dbms/src/Formats/TemplateRowInputStream.h b/dbms/src/Formats/TemplateRowInputStream.h index bab67d29d7b..e0cbb45fdb9 100644 --- a/dbms/src/Formats/TemplateRowInputStream.h +++ b/dbms/src/Formats/TemplateRowInputStream.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -27,10 +28,12 @@ public: private: void deserializeField(const IDataType & type, IColumn & column, 
ColumnFormat col_format); - inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(istr); } + inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); } + bool checkForSuffix(); + bool compareSuffixPart(StringRef & suffix, BufferBase::Position pos, size_t available); private: - ReadBuffer & istr; + PeekableReadBuffer buf; Block header; DataTypes types; diff --git a/dbms/src/IO/PeekableReadBuffer.h b/dbms/src/IO/PeekableReadBuffer.h new file mode 100644 index 00000000000..51c56042d63 --- /dev/null +++ b/dbms/src/IO/PeekableReadBuffer.h @@ -0,0 +1,99 @@ +#pragma once +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int MEMORY_LIMIT_EXCEEDED; +} + +/// Allows to peek next part of data from sub-buffer without extracting it +class PeekableReadBuffer : public BufferWithOwnMemory +{ +public: + constexpr static size_t default_limit = 32 * DBMS_DEFAULT_BUFFER_SIZE; + + explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t unread_limit_ = default_limit) + : sub_buf(sub_buf_), unread_limit(unread_limit_), peeked_size(0) + { + /// Read from sub-buffer + Buffer & sub_working = sub_buf.buffer(); + BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset()); + } + + bool peekNext() + { + if (sub_buf.eof()) + return false; + + size_t offset = peeked_size ? this->offset() : 0; + size_t available = peeked_size ? sub_buf.buffer().size() : this->available(); + Position sub_buf_pos = peeked_size ? 
sub_buf.buffer().begin() : pos; + size_t new_size = peeked_size + available; + + if (memory.size() < new_size) + { + if (available < offset && 2 * (peeked_size - offset) <= memory.size()) + { + /// Move unread data to the beginning of own memory instead of resize own memory + peeked_size -= offset; + new_size -= offset; + memmove(memory.data(), memory.data() + offset, peeked_size); + working_buffer.resize(peeked_size); + pos = memory.data(); + offset = 0; + } + else + { + if (unread_limit < new_size) + throw DB::Exception("trying to peek too much data", ErrorCodes::MEMORY_LIMIT_EXCEEDED); + memory.resize(new_size); + } + } + + /// Save unread data from sub-buffer to own memory + memcpy(memory.data() + peeked_size, sub_buf_pos, available); + peeked_size = new_size; + /// Switch to reading from own memory (or just update size if already switched) + BufferBase::set(memory.data(), new_size, offset); + + sub_buf.position() += available; + return sub_buf.next(); + } + + Buffer & lastPeeked() + { + return sub_buf.buffer(); + } + +private: + bool nextImpl() override + { + bool res = true; + if (peeked_size) + { + /// All copied data have been read from own memory, continue reading from sub_buf + peeked_size = 0; + } + else + { + /// Load next data to sub_buf + sub_buf.position() = pos; + res = sub_buf.next(); + } + + Buffer & sub_working = sub_buf.buffer(); + /// Switch to reading from sub_buf (or just update it if already switched) + BufferBase::set(sub_working.begin(), sub_working.size(), 0); + return res; + } + + ReadBuffer & sub_buf; + const size_t unread_limit; + size_t peeked_size; +}; + +} From a931e16c6ced251adf42cd8a7b4b92961ec5b9cb Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sun, 14 Apr 2019 23:01:03 +0300 Subject: [PATCH 06/43] Diagnostic info refactored --- dbms/src/Formats/CSVRowInputStream.cpp | 200 +++--------------- dbms/src/Formats/CSVRowInputStream.h | 32 +-- .../RowInputStreamWithDiagnosticInfo.cpp | 165 +++++++++++++++ 
.../RowInputStreamWithDiagnosticInfo.h | 48 +++++ .../Formats/TabSeparatedRowInputStream.cpp | 175 ++------------- dbms/src/Formats/TabSeparatedRowInputStream.h | 27 +-- 6 files changed, 286 insertions(+), 361 deletions(-) create mode 100644 dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp create mode 100644 dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h diff --git a/dbms/src/Formats/CSVRowInputStream.cpp b/dbms/src/Formats/CSVRowInputStream.cpp index 07cfd4826df..0ab371b3b22 100644 --- a/dbms/src/Formats/CSVRowInputStream.cpp +++ b/dbms/src/Formats/CSVRowInputStream.cpp @@ -1,5 +1,3 @@ -#include - #include #include @@ -7,6 +5,7 @@ #include #include #include +#include namespace DB @@ -15,7 +14,6 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DATA; - extern const int LOGICAL_ERROR; } @@ -90,7 +88,7 @@ static void skipRow(ReadBuffer & istr, const FormatSettings::CSV & settings, siz CSVRowInputStream::CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings) - : istr(istr_), header(header_), with_names(with_names_), format_settings(format_settings) + : RowInputStreamWithDiagnosticInfo(istr_, header_), with_names(with_names_), format_settings(format_settings) { const auto num_columns = header.columns(); @@ -274,71 +272,7 @@ bool CSVRowInputStream::read(MutableColumns & columns, RowReadExtension & ext) return true; } - -String CSVRowInputStream::getDiagnosticInfo() -{ - if (istr.eof()) /// Buffer has gone, cannot extract information about what has been parsed. - return {}; - - WriteBufferFromOwnString out; - - MutableColumns columns = header.cloneEmptyColumns(); - - /// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer. 
- size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset(); - if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) - { - out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; - return out.str(); - } - - size_t max_length_of_column_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).name.size() > max_length_of_column_name) - max_length_of_column_name = header.safeGetByPosition(i).name.size(); - - size_t max_length_of_data_type_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name) - max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size(); - - /// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information. - - if (pos_of_prev_row) - { - istr.position() = pos_of_prev_row; - - out << "\nRow " << (row_num - 1) << ":\n"; - if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name)) - return out.str(); - } - else - { - if (!pos_of_current_row) - { - out << "Could not print diagnostic info because parsing of data hasn't started.\n"; - return out.str(); - } - - istr.position() = pos_of_current_row; - } - - out << "\nRow " << row_num << ":\n"; - parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name); - out << "\n"; - - return out.str(); -} - - -/** gcc-7 generates wrong code with optimization level greater than 1. - * See tests: dbms/src/IO/tests/write_int.cpp - * and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh - * This is compiler bug. The bug does not present in gcc-8 and clang-8. - * Nevertheless, we don't need high optimization of this function. 
- */ -bool OPTIMIZE(1) CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, - WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name) +bool CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { const char delimiter = format_settings.csv.delimiter; @@ -352,100 +286,18 @@ bool OPTIMIZE(1) CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumn if (column_indexes_for_input_fields[file_column].has_value()) { - const auto & table_column = *column_indexes_for_input_fields[file_column]; - const auto & current_column_type = data_types[table_column]; - const bool is_last_file_column = - file_column + 1 == column_indexes_for_input_fields.size(); - const bool at_delimiter = *istr.position() == delimiter; - const bool at_last_column_line_end = is_last_file_column - && (*istr.position() == '\n' || *istr.position() == '\r' - || istr.eof()); - - out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ') - << "name: " << header.safeGetByPosition(table_column).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(table_column).name.size(), ' ') - << "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' '); - - if (format_settings.csv.empty_as_default - && (at_delimiter || at_last_column_line_end)) - { - columns[table_column]->insertDefault(); - } - else - { - BufferBase::Position prev_position = istr.position(); - BufferBase::Position curr_position = istr.position(); - std::exception_ptr exception; - - try - { - skipWhitespacesAndTabs(istr); - prev_position = istr.position(); - current_column_type->deserializeAsTextCSV(*columns[table_column], istr, format_settings); - curr_position = istr.position(); - skipWhitespacesAndTabs(istr); - } - catch (...) 
- { - exception = std::current_exception(); - } - - if (curr_position < prev_position) - throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR); - - if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type)) - { - /// An empty string instead of a value. - if (curr_position == prev_position) - { - out << "ERROR: text "; - verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out); - out << " is not like " << current_column_type->getName() << "\n"; - return false; - } - } - - out << "parsed text: "; - verbosePrintString(prev_position, curr_position, out); - - if (exception) - { - if (current_column_type->getName() == "DateTime") - out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; - else if (current_column_type->getName() == "Date") - out << "ERROR: Date must be in YYYY-MM-DD format.\n"; - else - out << "ERROR\n"; - return false; - } - - out << "\n"; - - if (current_column_type->haveMaximumSizeOfValue() - && *curr_position != '\n' && *curr_position != '\r' - && *curr_position != delimiter) - { - out << "ERROR: garbage after " << current_column_type->getName() << ": "; - verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out); - out << "\n"; - - if (current_column_type->getName() == "DateTime") - out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; - else if (current_column_type->getName() == "Date") - out << "ERROR: Date must be in YYYY-MM-DD format.\n"; - - return false; - } - } + size_t col_idx = column_indexes_for_input_fields[file_column].value(); + if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], + out, file_column)) + return false; } else { static const String skipped_column_str = ""; - out << "Column " << file_column << ", " << 
std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ') - << "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ') - << "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' '); - - String tmp; - readCSVString(tmp, istr, format_settings.csv); + static const DataTypePtr skipped_column_type = std::make_shared(); + static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); + if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) + return false; } /// Delimiters @@ -509,15 +361,33 @@ void CSVRowInputStream::syncAfterError() skipToNextLineOrEOF(istr); } -void CSVRowInputStream::updateDiagnosticInfo() +void +CSVRowInputStream::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) { - ++row_num; + skipWhitespacesAndTabs(istr); + prev_pos = istr.position(); - bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row; - bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset(); + if (column_indexes_for_input_fields[input_position]) + { + const bool is_last_file_column = input_position + 1 == column_indexes_for_input_fields.size(); + const bool at_delimiter = *istr.position() == format_settings.csv.delimiter; + const bool at_last_column_line_end = is_last_file_column + && (*istr.position() == '\n' || *istr.position() == '\r' || istr.eof()); - pos_of_prev_row = pos_of_current_row; - pos_of_current_row = istr.position(); + if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end)) + column.insertDefault(); + else + type->deserializeAsTextCSV(column, istr, format_settings); + } + else + { + String tmp; + readCSVString(tmp, istr, format_settings.csv); + } + + curr_pos = 
istr.position(); + skipWhitespacesAndTabs(istr); } diff --git a/dbms/src/Formats/CSVRowInputStream.h b/dbms/src/Formats/CSVRowInputStream.h index b282b22570e..3eba770f0fe 100644 --- a/dbms/src/Formats/CSVRowInputStream.h +++ b/dbms/src/Formats/CSVRowInputStream.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include @@ -16,7 +16,7 @@ class ReadBuffer; /** A stream for inputting data in csv format. * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. */ -class CSVRowInputStream : public IRowInputStream +class CSVRowInputStream : public RowInputStreamWithDiagnosticInfo { public: /** with_names - in the first line the header with column names @@ -28,15 +28,10 @@ public: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; - std::string getDiagnosticInfo() override; - private: - ReadBuffer & istr; - Block header; bool with_names; - DataTypes data_types; - const FormatSettings format_settings; + DataTypes data_types; using IndexesMap = std::unordered_map; IndexesMap column_indexes_by_names; @@ -57,20 +52,13 @@ private: void addInputColumn(const String & column_name); - /// For convenient diagnostics in case of an error. - size_t row_num = 0; - - /// How many bytes were read, not counting those that are still in the buffer. 
- size_t bytes_read_at_start_of_buffer_on_current_row = 0; - size_t bytes_read_at_start_of_buffer_on_prev_row = 0; - - char * pos_of_current_row = nullptr; - char * pos_of_prev_row = nullptr; - - void updateDiagnosticInfo(); - - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, - WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name); + bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; + void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) override; + bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override + { + return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter; + } }; } diff --git a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp new file mode 100644 index 00000000000..6d3e42649e4 --- /dev/null +++ b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp @@ -0,0 +1,165 @@ +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +DB::RowInputStreamWithDiagnosticInfo::RowInputStreamWithDiagnosticInfo(ReadBuffer & istr_, const Block & header_) + : istr(istr_), header(header_) +{ +} + +void DB::RowInputStreamWithDiagnosticInfo::updateDiagnosticInfo() +{ + ++row_num; + + bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row; + bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset(); + + pos_of_prev_row = pos_of_current_row; + pos_of_current_row = istr.position(); +} + +String DB::RowInputStreamWithDiagnosticInfo::getDiagnosticInfo() +{ + if (istr.eof()) /// Buffer has gone, cannot extract information about what has been parsed. 
+ return {}; + + WriteBufferFromOwnString out; + + MutableColumns columns = header.cloneEmptyColumns(); + + /// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer. + size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset(); + if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) + { + out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; + return out.str(); + } + + max_length_of_column_name = 0; + for (size_t i = 0; i < header.columns(); ++i) + if (header.safeGetByPosition(i).name.size() > max_length_of_column_name) + max_length_of_column_name = header.safeGetByPosition(i).name.size(); + + max_length_of_data_type_name = 0; + for (size_t i = 0; i < header.columns(); ++i) + if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name) + max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size(); + + /// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information. + + if (pos_of_prev_row) + { + istr.position() = pos_of_prev_row; + + out << "\nRow " << (row_num - 1) << ":\n"; + if (!parseRowAndPrintDiagnosticInfo(columns, out)) + return out.str(); + } + else + { + if (!pos_of_current_row) + { + out << "Could not print diagnostic info because parsing of data hasn't started.\n"; + return out.str(); + } + + istr.position() = pos_of_current_row; + } + + out << "\nRow " << row_num << ":\n"; + parseRowAndPrintDiagnosticInfo(columns, out); + out << "\n"; + + return out.str(); +} + +bool RowInputStreamWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, + IColumn & column, + WriteBuffer & out, + size_t input_position) +{ + out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 
1 : 0), ' ') + << "name: " << alignedName(col_name, max_length_of_column_name) + << "type: " << alignedName(type->getName(), max_length_of_data_type_name); + + auto prev_position = istr.position(); + auto curr_position = istr.position(); + std::exception_ptr exception; + + try + { + tryDeserializeFiled(type, column, input_position, prev_position, curr_position); + } + catch (...) + { + exception = std::current_exception(); + } + + if (curr_position < prev_position) + throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR); + + if (isNativeNumber(type) || isDateOrDateTime(type)) + { + /// An empty string instead of a value. + if (curr_position == prev_position) + { + out << "ERROR: text "; + verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out); + out << " is not like " << type->getName() << "\n"; + return false; + } + } + + out << "parsed text: "; + verbosePrintString(prev_position, curr_position, out); + + if (exception) + { + if (type->getName() == "DateTime") + out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; + else if (type->getName() == "Date") + out << "ERROR: Date must be in YYYY-MM-DD format.\n"; + else + out << "ERROR\n"; + return false; + } + + out << "\n"; + + if (type->haveMaximumSizeOfValue()) + { + if (isGarbageAfterField(input_position, curr_position)) + { + out << "ERROR: garbage after " << type->getName() << ": "; + verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out); + out << "\n"; + + if (type->getName() == "DateTime") + out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; + else if (type->getName() == "Date") + out << "ERROR: Date must be in YYYY-MM-DD format.\n"; + + return false; + } + } + + return true; +} + +String RowInputStreamWithDiagnosticInfo::alignedName(const String & name, size_t max_length) const 
+{ + size_t spaces_count = max_length >= name.size() ? max_length - name.size() : 0; + return name + ", " + std::string(spaces_count, ' '); +} + +} diff --git a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h new file mode 100644 index 00000000000..947975f1abd --- /dev/null +++ b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class RowInputStreamWithDiagnosticInfo : public IRowInputStream +{ +public: + RowInputStreamWithDiagnosticInfo(ReadBuffer & istr_, const Block & header_); + + String getDiagnosticInfo() override; + +protected: + void updateDiagnosticInfo(); + bool deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, IColumn & column, + WriteBuffer & out, size_t input_position); + String alignedName(const String & name, size_t max_length) const; + + virtual bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) = 0; + virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) = 0; + virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0; + + ReadBuffer & istr; + Block header; + + /// For convenient diagnostics in case of an error. + size_t row_num = 0; + +private: + /// How many bytes were read, not counting those still in the buffer. + size_t bytes_read_at_start_of_buffer_on_current_row = 0; + size_t bytes_read_at_start_of_buffer_on_prev_row = 0; + + char * pos_of_current_row = nullptr; + char * pos_of_prev_row = nullptr; + + /// For alignment of diagnostic info. 
+ size_t max_length_of_column_name = 0; + size_t max_length_of_data_type_name = 0; +}; + +} diff --git a/dbms/src/Formats/TabSeparatedRowInputStream.cpp b/dbms/src/Formats/TabSeparatedRowInputStream.cpp index 0c16c14e306..f0412d601c7 100644 --- a/dbms/src/Formats/TabSeparatedRowInputStream.cpp +++ b/dbms/src/Formats/TabSeparatedRowInputStream.cpp @@ -1,15 +1,11 @@ -#include - -#include - #include -#include #include #include #include #include #include +#include namespace DB @@ -18,7 +14,6 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DATA; - extern const int LOGICAL_ERROR; } @@ -49,7 +44,7 @@ static void checkForCarriageReturn(ReadBuffer & istr) TabSeparatedRowInputStream::TabSeparatedRowInputStream( ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings) - : istr(istr_), header(header_), with_names(with_names_), with_types(with_types_), format_settings(format_settings) + : RowInputStreamWithDiagnosticInfo(istr_, header_), with_names(with_names_), with_types(with_types_), format_settings(format_settings) { const auto num_columns = header.columns(); @@ -209,70 +204,7 @@ bool TabSeparatedRowInputStream::read(MutableColumns & columns, RowReadExtension return true; } - -String TabSeparatedRowInputStream::getDiagnosticInfo() -{ - if (istr.eof()) /// Buffer has gone, cannot extract information about what has been parsed. - return {}; - - WriteBufferFromOwnString out; - MutableColumns columns = header.cloneEmptyColumns(); - - /// It is possible to display detailed diagnostics only if the last and next to last lines are still in the read buffer. 
- size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset(); - if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) - { - out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; - return out.str(); - } - - size_t max_length_of_column_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).name.size() > max_length_of_column_name) - max_length_of_column_name = header.safeGetByPosition(i).name.size(); - - size_t max_length_of_data_type_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name) - max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size(); - - /// Roll back the cursor to the beginning of the previous or current line and parse all over again. But now we derive detailed information. - - if (pos_of_prev_row) - { - istr.position() = pos_of_prev_row; - - out << "\nRow " << (row_num - 1) << ":\n"; - if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name)) - return out.str(); - } - else - { - if (!pos_of_current_row) - { - out << "Could not print diagnostic info because parsing of data hasn't started.\n"; - return out.str(); - } - - istr.position() = pos_of_current_row; - } - - out << "\nRow " << row_num << ":\n"; - parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name); - out << "\n"; - - return out.str(); -} - - -/** gcc-7 generates wrong code with optimization level greater than 1. - * See tests: dbms/src/IO/tests/write_int.cpp - * and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh - * This is compiler bug. The bug does not present in gcc-8 and clang-8. - * Nevertheless, we don't need high optimization of this function. 
- */ -bool OPTIMIZE(1) TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo( - MutableColumns & columns, WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name) +bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position) { @@ -284,84 +216,18 @@ bool OPTIMIZE(1) TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo( if (column_indexes_for_input_fields[input_position].has_value()) { - const auto & column_index = *column_indexes_for_input_fields[input_position]; - const auto & current_column_type = data_types[column_index]; - - out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ') - << "name: " << header.safeGetByPosition(column_index).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(column_index).name.size(), ' ') - << "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' '); - - auto prev_position = istr.position(); - std::exception_ptr exception; - - try - { - current_column_type->deserializeAsTextEscaped(*columns[column_index], istr, format_settings); - } - catch (...) - { - exception = std::current_exception(); - } - - auto curr_position = istr.position(); - - if (curr_position < prev_position) - throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR); - - if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type)) - { - /// An empty string instead of a value. 
- if (curr_position == prev_position) - { - out << "ERROR: text "; - verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out); - out << " is not like " << current_column_type->getName() << "\n"; - return false; - } - } - - out << "parsed text: "; - verbosePrintString(prev_position, curr_position, out); - - if (exception) - { - if (current_column_type->getName() == "DateTime") - out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; - else if (current_column_type->getName() == "Date") - out << "ERROR: Date must be in YYYY-MM-DD format.\n"; - else - out << "ERROR\n"; + size_t col_idx = column_indexes_for_input_fields[input_position].value(); + if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], + out, input_position)) return false; - } - - out << "\n"; - - if (current_column_type->haveMaximumSizeOfValue()) - { - if (*curr_position != '\n' && *curr_position != '\t') - { - out << "ERROR: garbage after " << current_column_type->getName() << ": "; - verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out); - out << "\n"; - - if (current_column_type->getName() == "DateTime") - out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; - else if (current_column_type->getName() == "Date") - out << "ERROR: Date must be in YYYY-MM-DD format.\n"; - - return false; - } - } } else { static const String skipped_column_str = ""; - out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 
1 : 0), ' ') - << "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ') - << "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' '); - - NullSink null_sink; - readEscapedStringInto(null_sink, istr); + static const DataTypePtr skipped_column_type = std::make_shared(); + static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); + if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, input_position)) + return false; } /// Delimiters @@ -434,16 +300,19 @@ void TabSeparatedRowInputStream::syncAfterError() skipToUnescapedNextLineOrEOF(istr); } - -void TabSeparatedRowInputStream::updateDiagnosticInfo() +void TabSeparatedRowInputStream::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, + ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) { - ++row_num; - - bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row; - bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset(); - - pos_of_prev_row = pos_of_current_row; - pos_of_current_row = istr.position(); + prev_pos = istr.position(); + if (column_indexes_for_input_fields[input_position]) + type->deserializeAsTextEscaped(column, istr, format_settings); + else + { + NullSink null_sink; + readEscapedStringInto(null_sink, istr); + } + curr_pos = istr.position(); } diff --git a/dbms/src/Formats/TabSeparatedRowInputStream.h b/dbms/src/Formats/TabSeparatedRowInputStream.h index 3a0ed13c1bd..91697b54b2a 100644 --- a/dbms/src/Formats/TabSeparatedRowInputStream.h +++ b/dbms/src/Formats/TabSeparatedRowInputStream.h @@ -5,7 +5,7 @@ #include #include -#include +#include namespace DB @@ -16,7 +16,7 @@ class ReadBuffer; /** A stream to input data in tsv format. 
*/ -class TabSeparatedRowInputStream : public IRowInputStream +class TabSeparatedRowInputStream : public RowInputStreamWithDiagnosticInfo { public: /** with_names - the first line is the header with the names of the columns @@ -30,11 +30,7 @@ public: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; - std::string getDiagnosticInfo() override; - private: - ReadBuffer & istr; - Block header; bool with_names; bool with_types; const FormatSettings format_settings; @@ -53,21 +49,10 @@ private: void setupAllColumnsByTableSchema(); void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext); - /// For convenient diagnostics in case of an error. - - size_t row_num = 0; - - /// How many bytes were read, not counting those still in the buffer. - size_t bytes_read_at_start_of_buffer_on_current_row = 0; - size_t bytes_read_at_start_of_buffer_on_prev_row = 0; - - char * pos_of_current_row = nullptr; - char * pos_of_prev_row = nullptr; - - void updateDiagnosticInfo(); - - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, - WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name); + bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; + void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) override; + bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } }; } From 4e97fd697aa349bf36c0693dd61a8b5627e775c0 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 15 Apr 2019 05:45:57 +0300 Subject: [PATCH 07/43] Diagnostic info for Template --- .../src/Formats/TemplateBlockOutputStream.cpp | 19 ++++ dbms/src/Formats/TemplateBlockOutputStream.h | 1 + dbms/src/Formats/TemplateRowInputStream.cpp | 94 +++++++++++++++++-- dbms/src/Formats/TemplateRowInputStream.h | 17 ++-- 4 files 
changed, 118 insertions(+), 13 deletions(-) diff --git a/dbms/src/Formats/TemplateBlockOutputStream.cpp b/dbms/src/Formats/TemplateBlockOutputStream.cpp index c43b880e349..8018ba7bfdd 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.cpp +++ b/dbms/src/Formats/TemplateBlockOutputStream.cpp @@ -109,6 +109,25 @@ size_t ParsedTemplateFormat::columnsCount() const return format_idx_to_column_idx.size(); } +String ParsedTemplateFormat::formatToString(ParsedTemplateFormat::ColumnFormat format) +{ + switch (format) + { + case ColumnFormat::Default: + return "Escaped (Default)"; + case ColumnFormat::Escaped: + return "Escaped"; + case ColumnFormat::Quoted: + return "Quoted"; + case ColumnFormat::Json: + return "Json"; + case ColumnFormat::Xml: + return "Xml"; + case ColumnFormat::Raw: + return "Raw"; + } + __builtin_unreachable(); +} TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_) diff --git a/dbms/src/Formats/TemplateBlockOutputStream.h b/dbms/src/Formats/TemplateBlockOutputStream.h index 29b79979ff1..42e9ea7820b 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.h +++ b/dbms/src/Formats/TemplateBlockOutputStream.h @@ -30,6 +30,7 @@ struct ParsedTemplateFormat ParsedTemplateFormat() = default; ParsedTemplateFormat(const String & format_string, const ColumnIdxGetter & idxByName); static ColumnFormat stringToFormat(const String & format); + static String formatToString(ColumnFormat format); size_t columnsCount() const; }; diff --git a/dbms/src/Formats/TemplateRowInputStream.cpp b/dbms/src/Formats/TemplateRowInputStream.cpp index a85a79620b0..9be1eb5a993 100644 --- a/dbms/src/Formats/TemplateRowInputStream.cpp +++ b/dbms/src/Formats/TemplateRowInputStream.cpp @@ -1,6 +1,8 @@ #include #include #include +#include +#include namespace DB { @@ -13,7 +15,7 @@ extern const int INVALID_TEMPLATE_FORMAT; TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & header_, 
const FormatSettings & settings_, bool ignore_spaces_) - : buf(istr_), header(header_), types(header.getDataTypes()), settings(settings_), ignore_spaces(ignore_spaces_) + : RowInputStreamWithDiagnosticInfo(buf, header_), buf(istr_), settings(settings_), ignore_spaces(ignore_spaces_) { static const String default_format("${data}"); const String & format_str = settings.template_settings.format.empty() ? default_format : settings.template_settings.format; @@ -60,10 +62,10 @@ bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & e if (checkForSuffix()) return false; - if (row_count) - { + updateDiagnosticInfo(); + + if (likely(row_num != 1)) assertString(settings.template_settings.row_between_delimiter, buf); - } extra.read_columns.assign(columns.size(), false); @@ -73,7 +75,7 @@ bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & e assertString(row_format.delimiters[i], buf); size_t col_idx = row_format.format_idx_to_column_idx[i]; skipSpaces(); - deserializeField(*types[col_idx], *columns[col_idx], row_format.formats[i]); + deserializeField(*data_types[col_idx], *columns[col_idx], row_format.formats[i]); extra.read_columns[col_idx] = true; } @@ -84,7 +86,6 @@ bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & e if (!extra.read_columns[i]) header.getByPosition(i).type->insertDefaultInto(*columns[i]); - ++row_count; return true; } @@ -149,6 +150,87 @@ bool TemplateRowInputStream::compareSuffixPart(StringRef & suffix, BufferBase::P return true; } +bool TemplateRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out, + size_t max_length_of_column_name, size_t max_length_of_data_type_name) +{ + try + { + if (likely(row_num != 1)) + assertString(settings.template_settings.row_between_delimiter, buf); + } + catch (const DB::Exception &) + { + writeErrorStringForWrongDelimiter(out, "delimiter between rows", settings.template_settings.row_between_delimiter); + + 
return false; + } + for (size_t i = 0; i < row_format.columnsCount(); ++i) + { + skipSpaces(); + try + { + assertString(row_format.delimiters[i], buf); + } + catch (const DB::Exception &) + { + writeErrorStringForWrongDelimiter(out, "delimiter before field " + std::to_string(i), row_format.delimiters[i]); + return false; + } + + skipSpaces(); + size_t col_idx = row_format.format_idx_to_column_idx[i]; + if (!deserializeFieldAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name, col_idx)) + { + out << "Maybe it's not possible to deserialize field " + std::to_string(i) + + " as " + ParsedTemplateFormat::formatToString(row_format.formats[i]); + return false; + } + } + + skipSpaces(); + try + { + assertString(row_format.delimiters.back(), buf); + } + catch (const DB::Exception &) + { + writeErrorStringForWrongDelimiter(out, "delimiter after last field", row_format.delimiters.back()); + return false; + } + + return true; +} + +void TemplateRowInputStream::writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim) +{ + out << "ERROR: There is no " << description << ": expected "; + verbosePrintString(delim.data(), delim.data() + delim.size(), out); + out << ", got "; + if (buf.eof()) + out << ""; + else + verbosePrintString(buf.position(), std::min(buf.position() + delim.size() + 10, buf.buffer().end()), out); + out << '\n'; +} + +void TemplateRowInputStream::tryDeserializeFiled(MutableColumns & columns, size_t col_idx, ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) +{ + prev_pos = buf.position(); + auto format_iter = std::find(row_format.format_idx_to_column_idx.cbegin(), row_format.format_idx_to_column_idx.cend(), col_idx); + if (format_iter == row_format.format_idx_to_column_idx.cend()) + throw DB::Exception("Parse error", ErrorCodes::INVALID_TEMPLATE_FORMAT); + size_t format_idx = format_iter - row_format.format_idx_to_column_idx.begin(); + 
deserializeField(*data_types[col_idx], *columns[col_idx], row_format.formats[format_idx]); + curr_pos = buf.position(); +} + +bool TemplateRowInputStream::isGarbageAfterField(size_t, ReadBuffer::Position) +{ + /// Garbage will be considered as wrong delimiter + return false; +} + void registerInputFormatTemplate(FormatFactory & factory) { diff --git a/dbms/src/Formats/TemplateRowInputStream.h b/dbms/src/Formats/TemplateRowInputStream.h index e0cbb45fdb9..984ded1082e 100644 --- a/dbms/src/Formats/TemplateRowInputStream.h +++ b/dbms/src/Formats/TemplateRowInputStream.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include #include @@ -11,7 +11,7 @@ namespace DB { -class TemplateRowInputStream : public IRowInputStream +class TemplateRowInputStream : public RowInputStreamWithDiagnosticInfo { using ColumnFormat = ParsedTemplateFormat::ColumnFormat; public: @@ -24,7 +24,6 @@ public: // TODO //bool allowSyncAfterError() const override; //void syncAfterError() override; - //String getDiagnosticInfo() override; private: void deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format); @@ -32,17 +31,21 @@ private: bool checkForSuffix(); bool compareSuffixPart(StringRef & suffix, BufferBase::Position pos, size_t available); + bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out, + size_t max_length_of_column_name, size_t max_length_of_data_type_name) override; + void tryDeserializeFiled(MutableColumns & columns, size_t col_idx, + ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override; + bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override; + void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim); + + private: PeekableReadBuffer buf; - Block header; - DataTypes types; FormatSettings settings; ParsedTemplateFormat format; ParsedTemplateFormat row_format; const bool ignore_spaces; - - size_t row_count = 0; 
}; } From c92a0f3ac5eacd93873f84da259d59d0a7a69c59 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 17 Apr 2019 21:10:24 +0300 Subject: [PATCH 08/43] Fixes --- .../RowInputStreamWithDiagnosticInfo.cpp | 12 +++--- .../RowInputStreamWithDiagnosticInfo.h | 5 ++- .../src/Formats/TemplateBlockOutputStream.cpp | 20 +++++---- dbms/src/Formats/TemplateBlockOutputStream.h | 5 ++- dbms/src/Formats/TemplateRowInputStream.cpp | 42 ++++++++++++++----- dbms/src/Formats/TemplateRowInputStream.h | 14 +++---- dbms/src/IO/PeekableReadBuffer.h | 2 + 7 files changed, 63 insertions(+), 37 deletions(-) diff --git a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp index 6d3e42649e4..45bce165019 100644 --- a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp +++ b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp @@ -24,8 +24,8 @@ void DB::RowInputStreamWithDiagnosticInfo::updateDiagnosticInfo() bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row; bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset(); - pos_of_prev_row = pos_of_current_row; - pos_of_current_row = istr.position(); + offset_of_prev_row = offset_of_current_row; + offset_of_current_row = istr.offset(); } String DB::RowInputStreamWithDiagnosticInfo::getDiagnosticInfo() @@ -57,9 +57,9 @@ String DB::RowInputStreamWithDiagnosticInfo::getDiagnosticInfo() /// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information. 
- if (pos_of_prev_row) + if (offset_of_prev_row <= istr.buffer().size()) { - istr.position() = pos_of_prev_row; + istr.position() = istr.buffer().begin() + offset_of_prev_row; out << "\nRow " << (row_num - 1) << ":\n"; if (!parseRowAndPrintDiagnosticInfo(columns, out)) @@ -67,13 +67,13 @@ String DB::RowInputStreamWithDiagnosticInfo::getDiagnosticInfo() } else { - if (!pos_of_current_row) + if (istr.buffer().size() < offset_of_current_row) { out << "Could not print diagnostic info because parsing of data hasn't started.\n"; return out.str(); } - istr.position() = pos_of_current_row; + istr.position() = istr.buffer().begin() + offset_of_current_row; } out << "\nRow " << row_num << ":\n"; diff --git a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h index 947975f1abd..e0fad00a9a6 100644 --- a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h +++ b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -37,8 +38,8 @@ private: size_t bytes_read_at_start_of_buffer_on_current_row = 0; size_t bytes_read_at_start_of_buffer_on_prev_row = 0; - char * pos_of_current_row = nullptr; - char * pos_of_prev_row = nullptr; + size_t offset_of_current_row = std::numeric_limits::max(); + size_t offset_of_prev_row = std::numeric_limits::max(); /// For alignment of diagnostic info. 
size_t max_length_of_column_name = 0; diff --git a/dbms/src/Formats/TemplateBlockOutputStream.cpp b/dbms/src/Formats/TemplateBlockOutputStream.cpp index 8018ba7bfdd..f26f248edc0 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.cpp +++ b/dbms/src/Formats/TemplateBlockOutputStream.cpp @@ -133,26 +133,26 @@ String ParsedTemplateFormat::formatToString(ParsedTemplateFormat::ColumnFormat f TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_) : ostr(ostr_), header(sample), settings(settings_) { - static const String default_format("${result}"); + static const String default_format("${data}"); const String & format_str = settings.template_settings.format.empty() ? default_format : settings.template_settings.format; format = ParsedTemplateFormat(format_str, [&](const String & partName) { return static_cast(stringToOutputPart(partName)); }); - size_t resultIdx = format.format_idx_to_column_idx.size() + 1; + size_t dataIdx = format.format_idx_to_column_idx.size() + 1; for (size_t i = 0; i < format.format_idx_to_column_idx.size(); ++i) { switch (static_cast(format.format_idx_to_column_idx[i])) { - case OutputPart::Result: - resultIdx = i; + case OutputPart::Data: + dataIdx = i; BOOST_FALLTHROUGH; case OutputPart::Totals: case OutputPart::ExtremesMin: case OutputPart::ExtremesMax: if (format.formats[i] != ColumnFormat::Default) - throw Exception("invalid template: wrong serialization type for result, totals, min or max", + throw Exception("invalid template: wrong serialization type for data, totals, min or max", ErrorCodes::INVALID_TEMPLATE_FORMAT); break; default: @@ -160,8 +160,8 @@ TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const } } - if (resultIdx != 0) - throw Exception("invalid template: ${result} must be the first output part", ErrorCodes::INVALID_TEMPLATE_FORMAT); + if (dataIdx != 0) + throw Exception("invalid template: ${data} must be the first output 
part", ErrorCodes::INVALID_TEMPLATE_FORMAT); row_format = ParsedTemplateFormat(settings.template_settings.row_format, [&](const String & colName) { @@ -174,8 +174,8 @@ TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const TemplateBlockOutputStream::OutputPart TemplateBlockOutputStream::stringToOutputPart(const String & part) { - if (part == "result") - return OutputPart::Result; + if (part == "data") + return OutputPart::Data; else if (part == "totals") return OutputPart::Totals; else if (part == "min") @@ -295,6 +295,8 @@ void TemplateBlockOutputStream::writeSuffix() writeValue(row_count, format.formats[j]); break; case OutputPart::RowsBeforeLimit: + if (!rows_before_limit_set) + throw Exception("invalid template: cannot print rows_before_limit for this request", ErrorCodes::INVALID_TEMPLATE_FORMAT); writeValue(rows_before_limit, format.formats[j]); break; case OutputPart::TimeElapsed: diff --git a/dbms/src/Formats/TemplateBlockOutputStream.h b/dbms/src/Formats/TemplateBlockOutputStream.h index 42e9ea7820b..c64d70cbc18 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.h +++ b/dbms/src/Formats/TemplateBlockOutputStream.h @@ -47,7 +47,7 @@ public: void flush() override; - void setRowsBeforeLimit(size_t rows_before_limit_) override { rows_before_limit = rows_before_limit_; } + void setRowsBeforeLimit(size_t rows_before_limit_) override { rows_before_limit = rows_before_limit_; rows_before_limit_set = true; } void setTotals(const Block & totals_) override { totals = totals_; } void setExtremes(const Block & extremes_) override { extremes = extremes_; } void onProgress(const Progress & progress_) override { progress.incrementPiecewiseAtomically(progress_); } @@ -55,7 +55,7 @@ public: private: enum class OutputPart : size_t { - Result, + Data, Totals, ExtremesMin, ExtremesMax, @@ -80,6 +80,7 @@ private: ParsedTemplateFormat row_format; size_t rows_before_limit; + bool rows_before_limit_set = false; Block totals; Block extremes; Progress 
progress; diff --git a/dbms/src/Formats/TemplateRowInputStream.cpp b/dbms/src/Formats/TemplateRowInputStream.cpp index 9be1eb5a993..13737ca99c2 100644 --- a/dbms/src/Formats/TemplateRowInputStream.cpp +++ b/dbms/src/Formats/TemplateRowInputStream.cpp @@ -15,7 +15,8 @@ extern const int INVALID_TEMPLATE_FORMAT; TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & settings_, bool ignore_spaces_) - : RowInputStreamWithDiagnosticInfo(buf, header_), buf(istr_), settings(settings_), ignore_spaces(ignore_spaces_) + : RowInputStreamWithDiagnosticInfo(buf, header_), buf(istr_), data_types(header.getDataTypes()), + settings(settings_), ignore_spaces(ignore_spaces_) { static const String default_format("${data}"); const String & format_str = settings.template_settings.format.empty() ? default_format : settings.template_settings.format; @@ -141,6 +142,7 @@ bool TemplateRowInputStream::compareSuffixPart(StringRef & suffix, BufferBase::P while (pos != end) if (!isWhitespaceASCII(*pos)) return false; + return true; } if (likely(StringRef(suffix.data, available) != StringRef(pos, available))) @@ -150,8 +152,7 @@ bool TemplateRowInputStream::compareSuffixPart(StringRef & suffix, BufferBase::P return true; } -bool TemplateRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out, - size_t max_length_of_column_name, size_t max_length_of_data_type_name) +bool TemplateRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { try { @@ -179,7 +180,7 @@ bool TemplateRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & col skipSpaces(); size_t col_idx = row_format.format_idx_to_column_idx[i]; - if (!deserializeFieldAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name, col_idx)) + if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], out, i)) { out 
<< "Maybe it's not possible to deserialize field " + std::to_string(i) + " as " + ParsedTemplateFormat::formatToString(row_format.formats[i]); @@ -213,15 +214,11 @@ void TemplateRowInputStream::writeErrorStringForWrongDelimiter(WriteBuffer & out out << '\n'; } -void TemplateRowInputStream::tryDeserializeFiled(MutableColumns & columns, size_t col_idx, ReadBuffer::Position & prev_pos, +void TemplateRowInputStream::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) { prev_pos = buf.position(); - auto format_iter = std::find(row_format.format_idx_to_column_idx.cbegin(), row_format.format_idx_to_column_idx.cend(), col_idx); - if (format_iter == row_format.format_idx_to_column_idx.cend()) - throw DB::Exception("Parse error", ErrorCodes::INVALID_TEMPLATE_FORMAT); - size_t format_idx = format_iter - row_format.format_idx_to_column_idx.begin(); - deserializeField(*data_types[col_idx], *columns[col_idx], row_format.formats[format_idx]); + deserializeField(*type, column, row_format.formats[input_position]); curr_pos = buf.position(); } @@ -231,6 +228,31 @@ bool TemplateRowInputStream::isGarbageAfterField(size_t, ReadBuffer::Position) return false; } +bool TemplateRowInputStream::allowSyncAfterError() const +{ + return !row_format.delimiters.back().empty(); +} + +void TemplateRowInputStream::syncAfterError() +{ + StringRef delim(row_format.delimiters.back()); + if (unlikely(!delim.size)) return; + while (!buf.eof()) + { + void* pos = memchr(buf.position(), *delim.data, buf.available()); + if (!pos) + { + buf.position() += buf.available(); + continue; + } + buf.position() = static_cast(pos); + while (buf.available() < delim.size && buf.peekNext()); + if (buf.available() < delim.size || delim == StringRef(buf.position(), delim.size)) + return; + ++buf.position(); + } +} + void registerInputFormatTemplate(FormatFactory & factory) { diff --git 
a/dbms/src/Formats/TemplateRowInputStream.h b/dbms/src/Formats/TemplateRowInputStream.h index 984ded1082e..c69c4f71067 100644 --- a/dbms/src/Formats/TemplateRowInputStream.h +++ b/dbms/src/Formats/TemplateRowInputStream.h @@ -21,9 +21,8 @@ public: void readPrefix() override; - // TODO - //bool allowSyncAfterError() const override; - //void syncAfterError() override; + bool allowSyncAfterError() const override; + void syncAfterError() override; private: void deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format); @@ -31,16 +30,15 @@ private: bool checkForSuffix(); bool compareSuffixPart(StringRef & suffix, BufferBase::Position pos, size_t available); - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out, - size_t max_length_of_column_name, size_t max_length_of_data_type_name) override; - void tryDeserializeFiled(MutableColumns & columns, size_t col_idx, - ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override; + bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; + void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) override; bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override; void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim); - private: PeekableReadBuffer buf; + DataTypes data_types; FormatSettings settings; ParsedTemplateFormat format; diff --git a/dbms/src/IO/PeekableReadBuffer.h b/dbms/src/IO/PeekableReadBuffer.h index 51c56042d63..2784ac49ba1 100644 --- a/dbms/src/IO/PeekableReadBuffer.h +++ b/dbms/src/IO/PeekableReadBuffer.h @@ -44,6 +44,7 @@ public: memmove(memory.data(), memory.data() + offset, peeked_size); working_buffer.resize(peeked_size); pos = memory.data(); + bytes += offset; offset = 0; } else @@ -57,6 +58,7 @@ public: /// Save unread data 
from sub-buffer to own memory memcpy(memory.data() + peeked_size, sub_buf_pos, available); peeked_size = new_size; + bytes += sub_buf_pos - sub_buf.buffer().begin(); /// Switch to reading from own memory (or just update size if already switched) BufferBase::set(memory.data(), new_size, offset); From ab9ce18fc7c4bdf6c34b4a040f7a9f86f3be5a6f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 17 Apr 2019 23:15:57 +0300 Subject: [PATCH 09/43] Added CSV (de)serialization --- dbms/src/Formats/TemplateBlockOutputStream.cpp | 7 +++++++ dbms/src/Formats/TemplateBlockOutputStream.h | 1 + dbms/src/Formats/TemplateRowInputStream.cpp | 3 +++ 3 files changed, 11 insertions(+) diff --git a/dbms/src/Formats/TemplateBlockOutputStream.cpp b/dbms/src/Formats/TemplateBlockOutputStream.cpp index f26f248edc0..3986fff8bc1 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.cpp +++ b/dbms/src/Formats/TemplateBlockOutputStream.cpp @@ -94,6 +94,8 @@ ParsedTemplateFormat::ColumnFormat ParsedTemplateFormat::stringToFormat(const St return ColumnFormat::Escaped; else if (col_format == "Quoted") return ColumnFormat::Quoted; + else if (col_format == "CSV") + return ColumnFormat::Csv; else if (col_format == "JSON") return ColumnFormat::Json; else if (col_format == "XML") @@ -119,6 +121,8 @@ String ParsedTemplateFormat::formatToString(ParsedTemplateFormat::ColumnFormat f return "Escaped"; case ColumnFormat::Quoted: return "Quoted"; + case ColumnFormat::Csv: + return "CSV"; case ColumnFormat::Json: return "Json"; case ColumnFormat::Xml: @@ -226,6 +230,9 @@ void TemplateBlockOutputStream::serializeField(const IColumn & column, const IDa case ColumnFormat::Quoted: type.serializeAsTextQuoted(column, row_num, ostr, settings); break; + case ColumnFormat::Csv: + type.serializeAsTextCSV(column, row_num, ostr, settings); + break; case ColumnFormat::Json: type.serializeAsTextJSON(column, row_num, ostr, settings); break; diff --git a/dbms/src/Formats/TemplateBlockOutputStream.h 
b/dbms/src/Formats/TemplateBlockOutputStream.h index c64d70cbc18..ef41e3dba2e 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.h +++ b/dbms/src/Formats/TemplateBlockOutputStream.h @@ -17,6 +17,7 @@ struct ParsedTemplateFormat Default, Escaped, Quoted, + Csv, Json, Xml, Raw diff --git a/dbms/src/Formats/TemplateRowInputStream.cpp b/dbms/src/Formats/TemplateRowInputStream.cpp index 13737ca99c2..9c1576a8260 100644 --- a/dbms/src/Formats/TemplateRowInputStream.cpp +++ b/dbms/src/Formats/TemplateRowInputStream.cpp @@ -101,6 +101,9 @@ void TemplateRowInputStream::deserializeField(const IDataType & type, IColumn & case ColumnFormat::Quoted: type.deserializeAsTextQuoted(column, buf, settings); break; + case ColumnFormat::Csv: + type.deserializeAsTextCSV(column, buf, settings); + break; case ColumnFormat::Json: type.deserializeAsTextJSON(column, buf, settings); break; From f43f7c43a51328c192fd80159c4f69ddc61dba34 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sun, 21 Apr 2019 18:37:04 +0300 Subject: [PATCH 10/43] Fixes --- dbms/src/Formats/TemplateRowInputStream.cpp | 102 +++++++++++++++----- dbms/src/Formats/TemplateRowInputStream.h | 5 + dbms/src/IO/PeekableReadBuffer.h | 77 +++++++++------ 3 files changed, 128 insertions(+), 56 deletions(-) diff --git a/dbms/src/Formats/TemplateRowInputStream.cpp b/dbms/src/Formats/TemplateRowInputStream.cpp index 9c1576a8260..5dcd7c713da 100644 --- a/dbms/src/Formats/TemplateRowInputStream.cpp +++ b/dbms/src/Formats/TemplateRowInputStream.cpp @@ -10,6 +10,8 @@ namespace DB namespace ErrorCodes { extern const int INVALID_TEMPLATE_FORMAT; +extern const int ATTEMPT_TO_READ_AFTER_EOF; +extern const int CANNOT_READ_ALL_DATA; } @@ -92,23 +94,32 @@ bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & e void TemplateRowInputStream::deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format) { - switch (col_format) + try { - case ColumnFormat::Default: - case 
ColumnFormat::Escaped: - type.deserializeAsTextEscaped(column, buf, settings); - break; - case ColumnFormat::Quoted: - type.deserializeAsTextQuoted(column, buf, settings); - break; - case ColumnFormat::Csv: - type.deserializeAsTextCSV(column, buf, settings); - break; - case ColumnFormat::Json: - type.deserializeAsTextJSON(column, buf, settings); - break; - default: - break; + switch (col_format) + { + case ColumnFormat::Default: + case ColumnFormat::Escaped: + type.deserializeAsTextEscaped(column, buf, settings); + break; + case ColumnFormat::Quoted: + type.deserializeAsTextQuoted(column, buf, settings); + break; + case ColumnFormat::Csv: + type.deserializeAsTextCSV(column, buf, settings); + break; + case ColumnFormat::Json: + type.deserializeAsTextJSON(column, buf, settings); + break; + default: + break; + } + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throwUnexpectedEof(); + throw; } } @@ -116,6 +127,9 @@ void TemplateRowInputStream::deserializeField(const IDataType & type, IColumn & /// Otherwise returns false bool TemplateRowInputStream::checkForSuffix() { + if (unlikely(synced_after_error_at_last_row)) + return true; + StringRef suffix(format.delimiters.back()); if (likely(!compareSuffixPart(suffix, buf.position(), buf.available()))) return false; @@ -126,7 +140,10 @@ bool TemplateRowInputStream::checkForSuffix() if (likely(!compareSuffixPart(suffix, peeked.begin(), peeked.size()))) return false; } - return suffix.size == 0; + + if (suffix.size) + throwUnexpectedEof(); + return true; } /// Returns true if buffer contains only suffix and maybe some spaces after it @@ -139,11 +156,12 @@ bool TemplateRowInputStream::compareSuffixPart(StringRef & suffix, BufferBase::P return false; if (likely(suffix != StringRef(pos, suffix.size))) return false; - suffix.size = 0; - pos += suffix.size; + BufferBase::Position end = pos + available; + pos += suffix.size; + suffix.size = 0; while (pos != end) - if 
(!isWhitespaceASCII(*pos)) + if (!isWhitespaceASCII(*pos++)) return false; return true; } @@ -233,13 +251,33 @@ bool TemplateRowInputStream::isGarbageAfterField(size_t, ReadBuffer::Position) bool TemplateRowInputStream::allowSyncAfterError() const { - return !row_format.delimiters.back().empty(); + return !row_format.delimiters.back().empty() || !settings.template_settings.row_between_delimiter.empty(); } void TemplateRowInputStream::syncAfterError() { - StringRef delim(row_format.delimiters.back()); - if (unlikely(!delim.size)) return; + skipToNextDelimiterOrEof(row_format.delimiters.back()); + if (buf.eof()) + { + synced_after_error_at_last_row = true; + return; + } + buf.ignore(row_format.delimiters.back().size()); + + skipSpaces(); + if (checkForSuffix()) + return; + + skipToNextDelimiterOrEof(settings.template_settings.row_between_delimiter); + if (buf.eof()) + synced_after_error_at_last_row = true; +} + +/// Searches for delimiter in input stream and sets buffer position to the beginning of delimiter (if found) or EOF (if not) +void TemplateRowInputStream::skipToNextDelimiterOrEof(const String & delimiter) +{ + StringRef delim(delimiter); + if (!delim.size) return; while (!buf.eof()) { void* pos = memchr(buf.position(), *delim.data, buf.available()); @@ -248,14 +286,28 @@ void TemplateRowInputStream::syncAfterError() buf.position() += buf.available(); continue; } + buf.position() = static_cast(pos); + + /// Peek data until we can compare it with whole delim while (buf.available() < delim.size && buf.peekNext()); - if (buf.available() < delim.size || delim == StringRef(buf.position(), delim.size)) + + if (buf.available() < delim.size) + buf.position() += buf.available(); /// EOF, there is no delim + else if (delim != StringRef(buf.position(), delim.size)) + ++buf.position(); + else return; - ++buf.position(); } } +void TemplateRowInputStream::throwUnexpectedEof() +{ + throw Exception("Unexpected EOF while parsing row " + std::to_string(row_num) + ". 
" + "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", + ErrorCodes::CANNOT_READ_ALL_DATA); +} + void registerInputFormatTemplate(FormatFactory & factory) { diff --git a/dbms/src/Formats/TemplateRowInputStream.h b/dbms/src/Formats/TemplateRowInputStream.h index c69c4f71067..8858ca5704e 100644 --- a/dbms/src/Formats/TemplateRowInputStream.h +++ b/dbms/src/Formats/TemplateRowInputStream.h @@ -27,7 +27,9 @@ public: private: void deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format); inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); } + bool checkForSuffix(); + void throwUnexpectedEof(); bool compareSuffixPart(StringRef & suffix, BufferBase::Position pos, size_t available); bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; @@ -36,6 +38,8 @@ private: bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override; void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim); + void skipToNextDelimiterOrEof(const String& delimeter); + private: PeekableReadBuffer buf; DataTypes data_types; @@ -44,6 +48,7 @@ private: ParsedTemplateFormat format; ParsedTemplateFormat row_format; const bool ignore_spaces; + bool synced_after_error_at_last_row = false; }; } diff --git a/dbms/src/IO/PeekableReadBuffer.h b/dbms/src/IO/PeekableReadBuffer.h index 2784ac49ba1..c1c923db5fc 100644 --- a/dbms/src/IO/PeekableReadBuffer.h +++ b/dbms/src/IO/PeekableReadBuffer.h @@ -26,41 +26,27 @@ public: bool peekNext() { - if (sub_buf.eof()) - return false; - - size_t offset = peeked_size ? this->offset() : 0; - size_t available = peeked_size ? sub_buf.buffer().size() : this->available(); - Position sub_buf_pos = peeked_size ? 
sub_buf.buffer().begin() : pos; - size_t new_size = peeked_size + available; - - if (memory.size() < new_size) + if (!readFromOwnMemory()) { - if (available < offset && 2 * (peeked_size - offset) <= memory.size()) - { - /// Move unread data to the beginning of own memory instead of resize own memory - peeked_size -= offset; - new_size -= offset; - memmove(memory.data(), memory.data() + offset, peeked_size); - working_buffer.resize(peeked_size); - pos = memory.data(); - bytes += offset; - offset = 0; - } - else - { - if (unread_limit < new_size) - throw DB::Exception("trying to peek too much data", ErrorCodes::MEMORY_LIMIT_EXCEEDED); - memory.resize(new_size); - } + bytes += pos - sub_buf.buffer().begin(); + sub_buf.position() = pos; + } + size_t available = sub_buf.available(); + if (!available) + { + bool res = sub_buf.next(); + if (!readFromOwnMemory()) + BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset()); + return res; } + size_t offset = resizeOwnMemoryIfNecessary(available); + /// Save unread data from sub-buffer to own memory - memcpy(memory.data() + peeked_size, sub_buf_pos, available); - peeked_size = new_size; - bytes += sub_buf_pos - sub_buf.buffer().begin(); + memcpy(memory.data() + peeked_size, sub_buf.position(), available); + peeked_size += available; /// Switch to reading from own memory (or just update size if already switched) - BufferBase::set(memory.data(), new_size, offset); + BufferBase::set(memory.data(), peeked_size, offset); sub_buf.position() += available; return sub_buf.next(); @@ -75,7 +61,7 @@ private: bool nextImpl() override { bool res = true; - if (peeked_size) + if (readFromOwnMemory()) { /// All copied data have been read from own memory, continue reading from sub_buf peeked_size = 0; @@ -93,6 +79,35 @@ private: return res; } + inline bool readFromOwnMemory() const + { + return peeked_size; + } + + size_t resizeOwnMemoryIfNecessary(size_t bytes_to_append) + { + size_t offset = readFromOwnMemory() ? 
this->offset() : 0; + size_t new_size = peeked_size + bytes_to_append; + if (memory.size() < new_size) + { + if (bytes_to_append < offset && 2 * (peeked_size - offset) <= memory.size()) + { + /// Move unread data to the beginning of own memory instead of resize own memory + peeked_size -= offset; + memmove(memory.data(), memory.data() + offset, peeked_size); + bytes += offset; + return 0; + } + else + { + if (unread_limit < new_size) + throw DB::Exception("trying to peek too much data", ErrorCodes::MEMORY_LIMIT_EXCEEDED); + memory.resize(new_size); + } + } + return offset; + } + ReadBuffer & sub_buf; const size_t unread_limit; size_t peeked_size; From 96776c7fd528c1391764980d8c39ba89f6a0750d Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 22 Apr 2019 03:34:19 +0300 Subject: [PATCH 11/43] Docs (ru) --- docs/ru/interfaces/formats.md | 117 ++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index bc685443b0d..5442c6a0e5e 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -10,6 +10,8 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT | [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | +| [Template](#template) | ✔ | ✔ | +| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | | [CSV](#csv) | ✔ | ✔ | | [CSVWithNames](#csvwithnames) | ✔ | ✔ | | [Values](#data-format-values) | ✔ | ✔ | @@ -118,6 +120,121 @@ world Этот формат также доступен под именем `TSVWithNamesAndTypes`. +## Template {#template} + +Этот формат позволяет указать произвольную форматную строку, в которую подставляются значения, сериализованные выбранным способом. 
+ +Для этого используются настройки `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) + +Форматная строка `format_schema_rows` задаёт формат для строк таблицы и должна иметь вид: + + `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, + + где `delimiter_i` - разделители между значениями (символ `$` в разделителе экранируется как `$$`), + `column_i` - имена столбцов, значения которых должны быть выведены или считаны, + `serializeAs_i` - тип экранирования для значений соответствующего столбца. Поддерживаются следующие типы экранирования: + + - `CSV`, `JSON`, `XML` (как в одноимённых форматах) + - `Escaped` (как в `TSV`) + - `Quoted` (как в `Values`) + - `Raw` (без экранирования, как в `TSVRaw`) + + Тип экранирования для столбца можно не указывать, в таком случае используется `Escaped`. `XML` и `Raw` поддерживаются только для вывода. + + Так, в форматной строке + + `Search phrase: ${SearchPhrase:Quoted}, count: ${c}, ad price: $$${price:JSON};` + + между разделителями `Search phrase: `, `, count: `, `, ad price: $` и `;` при выводе будут подставлены (при вводе - будут ожидаться) значения столбцов `SearchPhrase`, `c` и `price`, сериализованные как `Quoted`, `Escaped` и `JSON` соответственно, например: + + `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` + + Настройка `format_schema_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожидается при вводе) после каждой строки, кроме последней. + +Форматная строка `format_schema` имеет аналогичный `format_schema_rows` синтаксис и позволяет указать префикс, суффикс и способ вывода дополнительной информации. 
Вместо имён столбцов в ней указываются следующие имена подстановок: + + - `data` - строки с данными в формате `format_schema_rows`, разделённые `format_schema_rows_between_delimiter`. Эта подстановка должна быть первой подстановкой в форматной строке. + - `totals` - строка с тотальными значениями в формате `format_schema_rows` (при использовании WITH TOTALS) + - `min` - строка с минимальными значениями в формате `format_schema_rows` (при настройке extremes, выставленной в 1) + - `max` - строка с максимальными значениями в формате `format_schema_rows` (при настройке extremes, выставленной в 1) + - `rows` - общее количество выведенных строчек + - `rows_before_limit` - не менее скольких строчек получилось бы, если бы не было LIMIT-а. Выводится только если запрос содержит LIMIT. В случае, если запрос содержит GROUP BY, `rows_before_limit` - точное число строк, которое получилось бы, если бы не было LIMIT-а. + - `time` - время выполнения запроса в секундах + - `rows_read` - сколько строк было прочитано при выполнении запроса + - `bytes_read` - сколько байт (несжатых) было прочитано при выполнении запроса + + У подстановок `data`, `totals`, `min` и `max` не должны быть указаны типы экранирования. Остальные подстановки - это отдельные значения, для них может быть указан любой тип экранирования. + Если строка `format_schema` пустая, то по умолчанию используется `${data}`. + При вводе форматная строка `format_schema` должна иметь вид `some prefix ${data} some suffix` т.е. содержать единственную подстановку `data`. + + Пример вывода: +```sql +SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 +FORMAT Template +SETTINGS format_schema = ' + Search phrases + + + + ${data} +
Search phrases
Search phrase Count
+ + ${max} +
Max
+ Processed ${rows_read:XML} rows in ${time:XML} sec + +', +format_schema_rows = ' ${SearchPhrase:XML} ${с:XML} ', +format_schema_rows_between_delimiter = '\n ' +``` +```html + + Search phrases + + + + + + + + +
Search phrases
Search phrase Count
8267016
bathroom interior design 2166
yandex 1655
spring 2014 fashion 1549
freeform photos 1480
+ + +
Max
8873898
+ Processed 3095973 rows in 0.1569913 sec + + +``` + +Пример ввода: +```json +{"array": + [ + {"PageViews": 5, "UserID": "4324182021466249494", "Duration": 146, "Sign": -1}, + {"PageViews": 6, "UserID": "4324182021466249494", "Duration": 185, "Sign": 1} + ] +} +``` +```bash +cat data.json | ./clickhouse client --query "INSERT INTO UserActivity FORMAT Template SETTINGS format_schema = '{\"array\":\n [\n \${data}\n ]\n}', format_schema_rows = '{\"PageViews\": \${PageViews:JSON}, \"UserID\": \${UserID:JSON}, \"Duration\": \${Duration:JSON}, \"Sign\": \${Sign:JSON}}', format_schema_rows_between_delimiter = ',\n '" +``` +В данном примере экранирование `"` и `$` нужно, чтобы настройки корректно передались через аргумент командной строки. Без этих экранирований настройки могли бы выглядеть так: +``` +format_schema = '{"array": + [ + ${data} + ] +}', +format_schema_rows = '{"PageViews": ${PageViews:JSON}, "UserID": ${UserID:JSON}, "Duration": ${Duration:JSON}, "Sign": ${Sign:JSON}}', +format_schema_rows_between_delimiter = ',\n ' +``` +Все разделители во входных данных должны строго соответствовать разделителям в форматных строках. + +## TemplateIgnoreSpaces {#templateignorespaces} + +Отличается от формата `Template` тем, что пропускает пробельные символы между разделителями и значениями во входном потоке. При этом, если форматные строки содержат пробельные символы, эти символы будут ожидаться во входных данных. Подходит только для ввода. + ## TSKV {#tskv} Похож на TabSeparated, но выводит значения в формате name=value. Имена экранируются так же, как строки в формате TabSeparated и, дополнительно, экранируется также символ =. 
From 04504ccf7b2151fc7fa4c7a9966680ae48dd1d4b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 Apr 2019 01:23:59 +0300 Subject: [PATCH 12/43] Docs (en) --- docs/en/interfaces/formats.md | 117 ++++++++++++++++++++++++ docs/en/operations/settings/settings.md | 7 +- docs/ru/interfaces/formats.md | 14 +-- docs/ru/operations/settings/settings.md | 8 +- 4 files changed, 131 insertions(+), 15 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 71f28263270..e4e39a587a9 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -14,6 +14,8 @@ The supported formats are: | [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | +| [Template](#template) | ✔ | ✔ | +| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | | [CSV](#csv) | ✔ | ✔ | | [CSVWithNames](#csvwithnames) | ✔ | ✔ | | [Values](#data-format-values) | ✔ | ✔ | @@ -122,6 +124,121 @@ During parsing, the first and second rows are completely ignored. This format is also available under the name `TSVWithNamesAndTypes`. +## Template {#template} + +This format allows to specify a custom format string with placeholders for values with specified escaping rule. + +It uses settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) + +Format string `format_schema_rows` specifies rows format with the following syntax: + + `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, + + where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), + `column_i` is a name of a column whose values are to be selected or inserted, + `serializeAs_i` is an escaping rule for the column values. 
The following escaping rules are supported: + + - `CSV`, `JSON`, `XML` (similarly to the formats of the same names) + - `Escaped` (similarly to `TSV`) + - `Quoted` (similarly to `Values`) + - `Raw` (without escaping, similarly to `TSVRaw`) + + Escaping rule may be omitted and in this case `Escaped` will be used. `XML` and `Raw` are suitable only for output. + + So, for the following format string: + + `Search phrase: ${SearchPhrase:Quoted}, count: ${c}, ad price: $$${price:JSON};` + + the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters respectively. For example: + + `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` + + The `format_schema_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one. + +Format string `format_schema` has the same syntax as `format_schema_rows` and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: + + - `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. + - `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS) + - `min` is the row with minimum values in `format_schema_rows` format (when extremes is set to 1) + - `max` is the row with maximum values in `format_schema_rows` format (when extremes is set to 1) + - `rows` is the total number of output rows + - `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. 
If the query contains GROUP BY, `rows_before_limit` is the exact number of rows there would have been without a LIMIT. + - `time` is the request execution time in seconds + - `rows_read` is the number of rows that have been read + - `bytes_read` is the number of bytes (uncompressed) that have been read + + The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified. The remaining placeholders may have any escaping rule specified. + If the `format_schema` setting is an empty string, `${data}` is used as default value. + For insert queries `format_schema` must be like `some prefix ${data} some suffix` i.e. it must contain a single placeholder `data`. + + `Select` example: +```sql +SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 +FORMAT Template +SETTINGS format_schema = ' + Search phrases + + + + ${data} +
Search phrases
Search phrase Count
+ + ${max} +
Max
+ Processed ${rows_read:XML} rows in ${time:XML} sec + +', +format_schema_rows = ' ${SearchPhrase:XML} ${c:XML} ', +format_schema_rows_between_delimiter = '\n ' +``` +```html + + Search phrases + + + + + + + + +
Search phrases
Search phrase Count
8267016
bathroom interior design 2166
yandex 1655
spring 2014 fashion 1549
freeform photos 1480
+ + +
Max
8873898
+ Processed 3095973 rows in 0.1569913 sec + + +``` + +`Insert` example: +```json +{"array": + [ + {"PageViews": 5, "UserID": "4324182021466249494", "Duration": 146, "Sign": -1}, + {"PageViews": 6, "UserID": "4324182021466249494", "Duration": 185, "Sign": 1} + ] +} +``` +```sql +cat data.json | ./clickhouse client --query "INSERT INTO UserActivity FORMAT Template SETTINGS format_schema = '{\"array\":\n [\n \${data}\n ]\n}', format_schema_rows = '{\"PageViews\": \${PageViews:JSON}, \"UserID\": \${UserID:JSON}, \"Duration\": \${Duration:JSON}, \"Sign\": \${Sign:JSON}}', format_schema_rows_between_delimiter = ',\n '" +``` +In this example, `"` and `$` are escaped with `\` to pass settings through the command line argument correctly. The settings may look like this without escaping: +``` +format_schema = '{"array": + [ + ${data} + ] +}', +format_schema_rows = '{"PageViews": ${PageViews:JSON}, "UserID": ${UserID:JSON}, "Duration": ${Duration:JSON}, "Sign": ${Sign:JSON}}', +format_schema_rows_between_delimiter = ',\n ' +``` +All delimiters in the input data must be strictly equal to delimiters in specified format strings. + +## TemplateIgnoreSpaces {#templateignorespaces} + +Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. This format is suitable only for input. + ## TSKV {#tskv} Similar to TabSeparated, but outputs a value in name=value format. Names are escaped the same way as in TabSeparated format, and the = symbol is also escaped. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 63648d95b77..9a4879ee72d 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -72,6 +72,9 @@ Works with tables in the MergeTree family. 
If `force_primary_key=1`, ClickHouse checks to see if the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition actually reduces the amount of data to read. For more information about data ranges in MergeTree tables, see "[MergeTree](../../operations/table_engines/mergetree.md)". +## format_schema + +This parameter is useful when you are using formats that require a schema definition, such as [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) or [Template](../../interfaces/formats.md#template-template). The value depends on the format. ## fsync_metadata @@ -566,10 +569,6 @@ If a query from the same user with the same 'query_id' already exists at this ti Yandex.Metrica uses this parameter set to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn't finished yet, it should be canceled. -## schema - -This parameter is useful when you are using formats that require a schema definition, such as [Cap'n Proto](https://capnproto.org/). The value depends on the format. - ## stream_flush_interval_ms diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 5442c6a0e5e..58c98403375 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -124,9 +124,9 @@ world Этот формат позволяет указать произвольную форматную строку, в которую подставляются значения, сериализованные выбранным способом. -Для этого используются настройки `format_schema`, `format_schema_row`, `format_schema_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. 
далее) +Для этого используются настройки `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) -Форматная строка `format_schema_row` задаёт формат для строк таблицы и должна иметь вид: +Форматная строка `format_schema_rows` задаёт формат для строк таблицы и должна иметь вид: `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, @@ -151,12 +151,12 @@ world Настройка `format_schema_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожмдается при вводе) после каждой строки, кроме последней. -Форматная строка `format_schema` имеет аналогичный `format_schema_row` синтаксис и позволяет указать префикс, суффикс и способ вывода дополнительной информации. Вместо имён столбцов в ней указываются следующие имена подстановок: +Форматная строка `format_schema` имеет аналогичный `format_schema_rows` синтаксис и позволяет указать префикс, суффикс и способ вывода дополнительной информации. Вместо имён столбцов в ней указываются следующие имена подстановок: - - `data` - строки с данными в формате `format_schema_row`, разделённые `format_schema_rows_between_delimiter`. Эта подстановка должна быть первой подстановкой в форматной строке. - - `totals` - строка с тотальными значениями в формате `format_schema_row` (при использовании WITH TOTALS) - - `min` - строка с минимальными значениями в формате `format_schema_row` (при настройке extremes, выставленной в 1) - - `max` - строка с максимальными значениями в формате `format_schema_row` (при настройке extremes, выставленной в 1) + - `data` - строки с данными в формате `format_schema_rows`, разделённые `format_schema_rows_between_delimiter`. Эта подстановка должна быть первой подстановкой в форматной строке. 
+ - `totals` - строка с тотальными значениями в формате `format_schema_rows` (при использовании WITH TOTALS) + - `min` - строка с минимальными значениями в формате `format_schema_rows` (при настройке extremes, выставленной в 1) + - `max` - строка с максимальными значениями в формате `format_schema_rows` (при настройке extremes, выставленной в 1) - `rows` - общее количество выведенных стрчек - `rows_before_limit` - не менее скольких строчек получилось бы, если бы не было LIMIT-а. Выводится только если запрос содержит LIMIT. В случае, если запрос содержит GROUP BY, `rows_before_limit` - точное число строк, которое получилось бы, если бы не было LIMIT-а. - `time` - время выполнения запроса в секундах diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index e5e4bad1fa6..760c7df215c 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -70,6 +70,10 @@ ClickHouse применяет настройку в тех случаях, ко При `force_primary_key=1` ClickHouse проверяет, есть ли в запросе условие на первичный ключ, которое может использоваться для отсечения диапазонов данных. Если подходящего условия нет - кидается исключение. При этом не проверяется, действительно ли условие уменьшает объём данных для чтения. Подробнее про диапазоны данных в таблицах MergeTree читайте в разделе "[MergeTree](../../operations/table_engines/mergetree.md)". +## format_schema + +Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) или [Template](../../interfaces/formats.md#template-template). Значение параметра зависит от формата. + ## fsync_metadata Включает или отключает [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fsync.html) при записи `.sql` файлов. По умолчанию включено. 
@@ -493,10 +497,6 @@ ClickHouse использует этот параметр при чтении д Эта настройка, выставленная в 1, используется в Яндекс.Метрике для реализации suggest-а значений для условий сегментации. После ввода очередного символа, если старый запрос ещё не выполнился, его следует отменить. -## schema - -Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap'n Proto](https://capnproto.org/). Значение параметра зависит от формата. - ## stream_flush_interval_ms Работает для таблиц со стриммингом в случае тайм-аута, или когда поток генерирует [max_insert_block_size](#settings-max_insert_block_size) строк. From 66ae943899da39a4f4209f2c7406ab6f79bd5984 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 Apr 2019 05:50:58 +0300 Subject: [PATCH 13/43] tests --- .../00937_template_output_format.reference | 16 ++++++++++++++++ .../00937_template_output_format.sql | 12 ++++++++++++ .../00938_template_input_format.reference | 8 ++++++++ .../0_stateless/00938_template_input_format.sh | 18 ++++++++++++++++++ 4 files changed, 54 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/00937_template_output_format.reference create mode 100644 dbms/tests/queries/0_stateless/00937_template_output_format.sql create mode 100644 dbms/tests/queries/0_stateless/00938_template_input_format.reference create mode 100755 dbms/tests/queries/0_stateless/00938_template_input_format.sh diff --git a/dbms/tests/queries/0_stateless/00937_template_output_format.reference b/dbms/tests/queries/0_stateless/00937_template_output_format.reference new file mode 100644 index 00000000000..c4cfb4ed3a4 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00937_template_output_format.reference @@ -0,0 +1,16 @@ +{prefix} +n: "123", s1: qwe,rty, s2: 'as"df\'gh', s3: "", s4: "zx +cv bn m", d: 2016-01-01, n: 123 ; +n: "456", s1: as"df\'gh, s2: '', s3: "zx\ncv\tbn m", s4: "qwe,rty", d: 2016-01-02, n: 456 ; +n: "9876543210", s1: , s2: 'zx\ncv\tbn m', s3: 
"qwe,rty", s4: "as""df'gh", d: 2016-01-03, n: 9876543210 ; +n: "789", s1: zx\ncv\tbn m, s2: 'qwe,rty', s3: "as\"df'gh", s4: "", d: 2016-01-04, n: 789 +------ +n: "0", s1: , s2: '', s3: "", s4: "", d: 0000-00-00, n: 0 +------ +n: "123", s1: , s2: '', s3: "", s4: "", d: 2016-01-01, n: 123 +------ +n: "9876543210", s1: zx\ncv\tbn m, s2: 'zx\ncv\tbn m', s3: "zx\ncv\tbn m", s4: "zx +cv bn m", d: 2016-01-04, n: 9876543210 +4 rows +before limit 4 +read 4 $ suffix $ \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/00937_template_output_format.sql b/dbms/tests/queries/0_stateless/00937_template_output_format.sql new file mode 100644 index 00000000000..9aed990149a --- /dev/null +++ b/dbms/tests/queries/0_stateless/00937_template_output_format.sql @@ -0,0 +1,12 @@ +DROP TABLE IF EXISTS test.template; +CREATE TABLE test.template (s1 String, s2 String, s3 String, s4 String, n UInt64, d Date) ENGINE = Memory; +INSERT INTO test.template VALUES +('qwe,rty', 'as"df''gh', '', 'zx\ncv\tbn m', 123, '2016-01-01'),('as"df''gh', '', 'zx\ncv\tbn m', 'qwe,rty', 456, '2016-01-02'),('', 'zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', 9876543210, '2016-01-03'),('zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', '', 789, '2016-01-04'); + +SELECT * FROM test.template WITH TOTALS LIMIT 4 FORMAT Template SETTINGS +extremes = 1, +format_schema = '{prefix} \n${data}\n------\n${totals:}\n------\n${min}\n------\n${max}\n${rows:} rows\nbefore limit ${rows_before_limit:XML}\nread ${rows_read} $$ suffix $$', +format_schema_rows = 'n:\t${n:JSON}, s1:\t${s1:Escaped}, s2:\t${s2:Quoted}, s3:\t${s3:JSON}, s4:\t${s4:CSV}, d:\t${d}, n:\t${n:Raw}\t', +format_schema_rows_between_delimiter = ';\n'; + +DROP TABLE test.template; \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/00938_template_input_format.reference b/dbms/tests/queries/0_stateless/00938_template_input_format.reference new file mode 100644 index 00000000000..3947822de3f --- /dev/null +++ 
b/dbms/tests/queries/0_stateless/00938_template_input_format.reference @@ -0,0 +1,8 @@ +"qwe,rty","as""df'gh","","zx +cv bn m",123,"2016-01-01" +"as""df'gh","","zx +cv bn m","qwe,rty",456,"2016-01-02" +"zx +cv bn m","qwe,rty","as""df'gh","",789,"2016-01-04" +"","zx +cv bn m","qwe,rty","as""df'gh",9876543210,"2016-01-03" diff --git a/dbms/tests/queries/0_stateless/00938_template_input_format.sh b/dbms/tests/queries/0_stateless/00938_template_input_format.sh new file mode 100755 index 00000000000..c397c901f22 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00938_template_input_format.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. $CURDIR/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.template"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE test.template (s1 String, s2 String, s3 String, s4 String, n UInt64, d Date) ENGINE = Memory"; + +echo "{prefix} +n: 123, s1: qwe,rty , s2: 'as\"df\\'gh', s3: \"\", s4: \"zx +cv bn m\", d: 2016-01-01 ; +n: 456, s1: as\"df\\'gh , s2: '', s3: \"zx\\ncv\\tbn m\", s4: \"qwe,rty\", d: 2016-01-02 ; +n: 9876543210, s1: , s2: 'zx\\ncv\\tbn m', s3: \"qwe,rty\", s4: \"as\"\"df'gh\", d: 2016-01-03 ; +n: 789, s1: zx\\ncv\\tbn m , s2: 'qwe,rty', s3: \"as\\\"df'gh\", s4: \"\", d: 2016-01-04 + $ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO test.template FORMAT Template SETTINGS format_schema = '{prefix} \n\${data}\n \$\$ suffix \$\$\n', format_schema_rows = 'n:\t\${n}, s1:\t\${s1:Escaped}\t, s2:\t\${s2:Quoted}, s3:\t\${s3:JSON}, s4:\t\${s4:CSV}, d:\t\${d}\t', format_schema_rows_between_delimiter = ';\n'"; + +$CLICKHOUSE_CLIENT --query="SELECT * FROM test.template ORDER BY n FORMAT CSV"; +$CLICKHOUSE_CLIENT --query="DROP TABLE test.template"; From 6e4efb94f077029e58b624873614a83e73e3e0cd Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 1 May 2019 23:22:09 +0300 Subject: [PATCH 14/43] OPTIMIZE(1) --- 
dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp index 45bce165019..fe49b22081d 100644 --- a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp +++ b/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp @@ -83,7 +83,13 @@ String DB::RowInputStreamWithDiagnosticInfo::getDiagnosticInfo() return out.str(); } -bool RowInputStreamWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, +/** gcc-7 may generate wrong code with optimization level greater than 1. + * See tests: dbms/src/IO/tests/write_int.cpp + * and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh + * This is compiler bug. The bug does not present in gcc-8 and clang-8. + * Nevertheless, we don't need high optimization of this function. + */ +bool OPTIMIZE(1) RowInputStreamWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, IColumn & column, WriteBuffer & out, size_t input_position) From 4d0a6bad5e7ca3d0c3f617fac9a7d240ff22bdc2 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sun, 12 May 2019 06:15:08 +0300 Subject: [PATCH 15/43] PeekableReadBuffer --- dbms/src/IO/PeekableReadBuffer.cpp | 271 +++++++++++++++++++++++++++++ dbms/src/IO/PeekableReadBuffer.h | 129 +++++--------- 2 files changed, 316 insertions(+), 84 deletions(-) create mode 100644 dbms/src/IO/PeekableReadBuffer.cpp diff --git a/dbms/src/IO/PeekableReadBuffer.cpp b/dbms/src/IO/PeekableReadBuffer.cpp new file mode 100644 index 00000000000..03fc8462ee5 --- /dev/null +++ b/dbms/src/IO/PeekableReadBuffer.cpp @@ -0,0 +1,271 @@ +#include + +namespace DB +{ + +PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t unread_limit_ /* = default_limit*/) + : sub_buf(sub_buf_), unread_limit(unread_limit_) +{ + /// 
Read from sub-buffer + Buffer & sub_working = sub_buf.buffer(); + BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset()); + + checkStateCorrect(); +} + +/// Saves unread data to own memory, so it will be possible to read it later. Loads next data to sub-buffer +bool PeekableReadBuffer::peekNext() +{ + checkStateCorrect(); + + size_t bytes_to_copy = sub_buf.available(); + if (useSubbufferOnly()) + { + /// Don't have to copy all data from sub-buffer if there is no data in own memory (checkpoint and pos are in sub-buffer) + Position copy_from = pos; + if (checkpoint) + copy_from = checkpoint; + bytes += copy_from - sub_buf.buffer().begin(); + sub_buf.position() = copy_from; + bytes_to_copy = sub_buf.available(); + if (!bytes_to_copy) + { + /// Both checkpoint and pos are at the end of sub-buffer. Just load next part of data. + bool res = sub_buf.next(); + BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset()); + if (checkpoint) + checkpoint = pos; + + checkStateCorrect(); + return res; + } + } + + resizeOwnMemoryIfNecessary(bytes_to_copy); + + /// Save unread data from sub-buffer to own memory + memcpy(memory.data() + peeked_size, sub_buf.position(), bytes_to_copy); + + /// If useSubbufferOnly() is false, then checkpoint is in own memory and it was updated in resizeOwnMemoryIfNecessary + /// Otherwise, checkpoint now at the beginning of own memory + if (checkpoint && useSubbufferOnly()) + { + checkpoint = memory.data(); + checkpoint_in_own_memory = true; + } + if (currentlyReadFromOwnMemory()) + { + /// Update buffer size + BufferBase::set(memory.data(), peeked_size + bytes_to_copy, offset()); + } + else + { + /// Switch to reading from own memory + size_t pos_offset = peeked_size + this->offset(); + if (useSubbufferOnly()) pos_offset = bytes_to_copy; + BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset); + + } + + peeked_size += bytes_to_copy; + sub_buf.position() += bytes_to_copy; + + 
checkStateCorrect(); + return sub_buf.next(); +} + +void PeekableReadBuffer::setCheckpoint() +{ + checkpoint_in_own_memory = currentlyReadFromOwnMemory(); + if (!checkpoint_in_own_memory) + { + /// Don't need to store unread data anymore + peeked_size = 0; + } + checkpoint = pos; +} + +void PeekableReadBuffer::dropCheckpoint() +{ + if (!currentlyReadFromOwnMemory()) + { + /// Don't need to store unread data anymore + peeked_size = 0; + } + checkpoint = nullptr; + checkpoint_in_own_memory = false; +} + +void PeekableReadBuffer::rollbackToCheckpoint() +{ + if (!checkpoint) + throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); + else if (checkpointInOwnMemory() == currentlyReadFromOwnMemory()) + pos = checkpoint; + else /// Checkpoint is in own memory and pos is not. Switch to reading from own memory + BufferBase::set(memory.data(), peeked_size, checkpoint - memory.data()); +} + +bool PeekableReadBuffer::nextImpl() +{ + /// FIXME wrong bytes count because it can read the same data again after rollbackToCheckpoint() + /// However, changing bytes count on every call of next() (even after rollback) allows to determine if some pointers were invalidated. 
+ checkStateCorrect(); + bool res = true; + + if (!checkpoint) + { + if (!useSubbufferOnly()) + { + /// All copied data have been read from own memory, continue reading from sub_buf + peeked_size = 0; + } + else + { + /// Load next data to sub_buf + sub_buf.position() = pos; + res = sub_buf.next(); + } + + Buffer & sub_working = sub_buf.buffer(); + /// Switch to reading from sub_buf (or just update it if already switched) + BufferBase::set(sub_working.begin(), sub_working.size(), 0); + } + else + { + if (!currentlyReadFromOwnMemory()) + res = peekNext(); + Buffer & sub_working = sub_buf.buffer(); + BufferBase::set(sub_working.begin(), sub_working.size(), 0); + } + + checkStateCorrect(); + return res; +} + +bool PeekableReadBuffer::useSubbufferOnly() const +{ + return !peeked_size; +} + +void PeekableReadBuffer::checkStateCorrect() const +{ + if (checkpoint) + { + if (checkpointInOwnMemory()) + { + if (!peeked_size) + throw DB::Exception("Checkpoint in empty own buffer", ErrorCodes::LOGICAL_ERROR); + if (currentlyReadFromOwnMemory() && pos < checkpoint) + throw DB::Exception("Current position in own buffer before checkpoint in own buffer", ErrorCodes::LOGICAL_ERROR); + } + else + { + if (peeked_size) + throw DB::Exception("Own buffer is not empty", ErrorCodes::LOGICAL_ERROR); + if (currentlyReadFromOwnMemory()) + throw DB::Exception("Current position in own buffer before checkpoint in subbuffer", ErrorCodes::LOGICAL_ERROR); + if (pos < checkpoint) + throw DB::Exception("Current position in subbuffer before checkpoint in subbuffer", ErrorCodes::LOGICAL_ERROR); + } + } + else + { + + if (!currentlyReadFromOwnMemory() && peeked_size) + throw DB::Exception("Own buffer is not empty", ErrorCodes::LOGICAL_ERROR); + } + if (currentlyReadFromOwnMemory() && !peeked_size) + throw DB::Exception("Pos in empty own buffer", ErrorCodes::LOGICAL_ERROR); +} + +size_t PeekableReadBuffer::resizeOwnMemoryIfNecessary(size_t bytes_to_append) +{ + checkStateCorrect(); + bool 
needUpdateCheckpoint = checkpointInOwnMemory(); + bool needUpdatePos = currentlyReadFromOwnMemory(); + size_t offset = 0; + if (needUpdateCheckpoint) + offset = checkpoint - memory.data(); + else if (needUpdatePos) + offset = this->offset(); + + size_t new_size = peeked_size + bytes_to_append; + if (memory.size() < new_size) + { + if (bytes_to_append < offset && 2 * (peeked_size - offset) <= memory.size()) + { + /// Move unread data to the beginning of own memory instead of resize own memory + peeked_size -= offset; + memmove(memory.data(), memory.data() + offset, peeked_size); + bytes += offset; + + if (needUpdateCheckpoint) + checkpoint -= offset; + if (needUpdatePos) + pos -= offset; + + checkStateCorrect(); + return 0; + } + else + { + if (unread_limit < new_size) + throw DB::Exception("trying to peek too much data", ErrorCodes::MEMORY_LIMIT_EXCEEDED); + + size_t pos_offset = pos - memory.data(); + + // TODO amortization + memory.resize(new_size); + + if (needUpdateCheckpoint) + checkpoint = memory.data() + offset; + if (needUpdatePos) + { + BufferBase::set(memory.data(), peeked_size, pos_offset); + } + } + } + + checkStateCorrect(); + return offset; +} + +PeekableReadBuffer::~PeekableReadBuffer() +{ + if (!currentlyReadFromOwnMemory()) + sub_buf.position() = pos; +} + +BufferWithOwnMemory PeekableReadBuffer::takeUnreadData() +{ + if (!currentlyReadFromOwnMemory()) + return BufferWithOwnMemory(0); + size_t unread_size = memory.data() + peeked_size - pos; + BufferWithOwnMemory unread(unread_size); + memcpy(unread.buffer().begin(), pos, unread_size); + peeked_size = 0; + checkpoint = nullptr; + checkpoint_in_own_memory = false; + BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset()); + return unread; +} + +bool PeekableReadBuffer::currentlyReadFromOwnMemory() const +{ + return working_buffer.begin() != sub_buf.buffer().begin(); +} + +bool PeekableReadBuffer::checkpointInOwnMemory() const +{ + return checkpoint_in_own_memory; +} + 
+void PeekableReadBuffer::assertCanBeDistructed() const +{ + if (peeked_size && pos != memory.data() + peeked_size) + throw DB::Exception("There are data, which were extracted from sub-buffer, but not from peekable buffer: " + "Cannot destruct peekable buffer correctly because tha data will be lost", ErrorCodes::LOGICAL_ERROR); +} + +} diff --git a/dbms/src/IO/PeekableReadBuffer.h b/dbms/src/IO/PeekableReadBuffer.h index c1c923db5fc..a7efc3134bb 100644 --- a/dbms/src/IO/PeekableReadBuffer.h +++ b/dbms/src/IO/PeekableReadBuffer.h @@ -7,110 +7,71 @@ namespace DB namespace ErrorCodes { -extern const int MEMORY_LIMIT_EXCEEDED; + extern const int MEMORY_LIMIT_EXCEEDED; + extern const int LOGICAL_ERROR; } -/// Allows to peek next part of data from sub-buffer without extracting it +/// Allows to peek next part of data from sub-buffer without extracting it. +/// Also allows to set checkpoint at some position in stream and come back to this position later, +/// even if next() was called. +/// Sub-buffer should not be accessed directly during the lifelime of peekable buffer. +/// If position() of peekable buffer is explicitly set to some position before checkpoint +/// (e.g. by istr.position() = prev_pos), behavior is undefined. 
class PeekableReadBuffer : public BufferWithOwnMemory { public: constexpr static size_t default_limit = 32 * DBMS_DEFAULT_BUFFER_SIZE; - explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t unread_limit_ = default_limit) - : sub_buf(sub_buf_), unread_limit(unread_limit_), peeked_size(0) - { - /// Read from sub-buffer - Buffer & sub_working = sub_buf.buffer(); - BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset()); - } + explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t unread_limit_ = default_limit); - bool peekNext() - { - if (!readFromOwnMemory()) - { - bytes += pos - sub_buf.buffer().begin(); - sub_buf.position() = pos; - } - size_t available = sub_buf.available(); - if (!available) - { - bool res = sub_buf.next(); - if (!readFromOwnMemory()) - BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset()); - return res; - } + /// Use takeUnreadData() to extract unread data before destruct object + ~PeekableReadBuffer() override; - size_t offset = resizeOwnMemoryIfNecessary(available); + /// Saves unread data to own memory, so it will be possible to read it later. Loads next data to sub-buffer. 
+ /// Doesn't change checkpoint and position in stream, + /// but all pointers (such as this->buffer().end() and this->position()) may be invalidated + /// @returns false in case of EOF in sub-buffer, otherwise returns true + bool peekNext(); - /// Save unread data from sub-buffer to own memory - memcpy(memory.data() + peeked_size, sub_buf.position(), available); - peeked_size += available; - /// Switch to reading from own memory (or just update size if already switched) - BufferBase::set(memory.data(), peeked_size, offset); + Buffer & lastPeeked() { return sub_buf.buffer(); } - sub_buf.position() += available; - return sub_buf.next(); - } + /// Sets checkpoint at current position + void setCheckpoint(); - Buffer & lastPeeked() - { - return sub_buf.buffer(); - } + /// Forget checkpoint and all data between checkpoint and position + void dropCheckpoint(); + + /// Sets position at checkpoint. + /// All pointers (such as this->buffer().end()) may be invalidated + void rollbackToCheckpoint(); + + /// If position is in own memory, returns buffer with data, which were extracted from sub-buffer, + /// but not from this buffer, so the data will not be lost after destruction of this buffer. + /// If position is in sub-buffer, returns empty buffer. 
+ BufferWithOwnMemory takeUnreadData(); + void assertCanBeDistructed() const; private: - bool nextImpl() override - { - bool res = true; - if (readFromOwnMemory()) - { - /// All copied data have been read from own memory, continue reading from sub_buf - peeked_size = 0; - } - else - { - /// Load next data to sub_buf - sub_buf.position() = pos; - res = sub_buf.next(); - } - Buffer & sub_working = sub_buf.buffer(); - /// Switch to reading from sub_buf (or just update it if already switched) - BufferBase::set(sub_working.begin(), sub_working.size(), 0); - return res; - } + bool nextImpl() override; - inline bool readFromOwnMemory() const - { - return peeked_size; - } + inline bool useSubbufferOnly() const; + inline bool currentlyReadFromOwnMemory() const; + inline bool checkpointInOwnMemory() const; + + void checkStateCorrect() const; + + /// Makes possible to append `bytes_to_append` bytes to data in own memory. + /// Updates all invalidated pointers and sizes. + /// @returns new offset of unread data in own memory + size_t resizeOwnMemoryIfNecessary(size_t bytes_to_append); - size_t resizeOwnMemoryIfNecessary(size_t bytes_to_append) - { - size_t offset = readFromOwnMemory() ? 
this->offset() : 0; - size_t new_size = peeked_size + bytes_to_append; - if (memory.size() < new_size) - { - if (bytes_to_append < offset && 2 * (peeked_size - offset) <= memory.size()) - { - /// Move unread data to the beginning of own memory instead of resize own memory - peeked_size -= offset; - memmove(memory.data(), memory.data() + offset, peeked_size); - bytes += offset; - return 0; - } - else - { - if (unread_limit < new_size) - throw DB::Exception("trying to peek too much data", ErrorCodes::MEMORY_LIMIT_EXCEEDED); - memory.resize(new_size); - } - } - return offset; - } ReadBuffer & sub_buf; const size_t unread_limit; - size_t peeked_size; + size_t peeked_size = 0; + Position checkpoint = nullptr; + bool checkpoint_in_own_memory = false; }; } From 23fddc8688c36d16391b9d44f666c548b9c9bb7b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 16 May 2019 05:05:44 +0300 Subject: [PATCH 16/43] Fixes --- dbms/src/IO/PeekableReadBuffer.cpp | 16 +++++++++++----- dbms/src/IO/PeekableReadBuffer.h | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dbms/src/IO/PeekableReadBuffer.cpp b/dbms/src/IO/PeekableReadBuffer.cpp index 03fc8462ee5..8aec1009443 100644 --- a/dbms/src/IO/PeekableReadBuffer.cpp +++ b/dbms/src/IO/PeekableReadBuffer.cpp @@ -62,7 +62,13 @@ bool PeekableReadBuffer::peekNext() { /// Switch to reading from own memory size_t pos_offset = peeked_size + this->offset(); - if (useSubbufferOnly()) pos_offset = bytes_to_copy; + if (useSubbufferOnly()) + { + if (checkpoint) + pos_offset = bytes_to_copy; + else + pos_offset = 0; + } BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset); } @@ -237,13 +243,13 @@ PeekableReadBuffer::~PeekableReadBuffer() sub_buf.position() = pos; } -BufferWithOwnMemory PeekableReadBuffer::takeUnreadData() +std::shared_ptr> PeekableReadBuffer::takeUnreadData() { if (!currentlyReadFromOwnMemory()) - return BufferWithOwnMemory(0); + return std::make_shared>(0); size_t unread_size = 
memory.data() + peeked_size - pos; - BufferWithOwnMemory unread(unread_size); - memcpy(unread.buffer().begin(), pos, unread_size); + auto unread = std::make_shared>(unread_size); + memcpy(unread->buffer().begin(), pos, unread_size); peeked_size = 0; checkpoint = nullptr; checkpoint_in_own_memory = false; diff --git a/dbms/src/IO/PeekableReadBuffer.h b/dbms/src/IO/PeekableReadBuffer.h index a7efc3134bb..d36bb544940 100644 --- a/dbms/src/IO/PeekableReadBuffer.h +++ b/dbms/src/IO/PeekableReadBuffer.h @@ -48,7 +48,7 @@ public: /// If position is in own memory, returns buffer with data, which were extracted from sub-buffer, /// but not from this buffer, so the data will not be lost after destruction of this buffer. /// If position is in sub-buffer, returns empty buffer. - BufferWithOwnMemory takeUnreadData(); + std::shared_ptr> takeUnreadData(); void assertCanBeDistructed() const; private: From 456a8c4b1792091cb7de8b62bec65ed65f335361 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 17 May 2019 00:24:46 +0300 Subject: [PATCH 17/43] Fix buffer padding --- dbms/src/IO/PeekableReadBuffer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/IO/PeekableReadBuffer.cpp b/dbms/src/IO/PeekableReadBuffer.cpp index 8aec1009443..6d70f790b54 100644 --- a/dbms/src/IO/PeekableReadBuffer.cpp +++ b/dbms/src/IO/PeekableReadBuffer.cpp @@ -6,6 +6,7 @@ namespace DB PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t unread_limit_ /* = default_limit*/) : sub_buf(sub_buf_), unread_limit(unread_limit_) { + padded = sub_buf.isPadded(); /// Read from sub-buffer Buffer & sub_working = sub_buf.buffer(); BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset()); From 7e9c3f20222395ac72f0f8e7189ef9249b35d5d4 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 17 May 2019 04:12:32 +0300 Subject: [PATCH 18/43] PeekableReadBufferCheckpoint --- dbms/src/IO/PeekableReadBuffer.cpp | 2 +- dbms/src/IO/PeekableReadBuffer.h | 13 ++++++++++++- 2 
files changed, 13 insertions(+), 2 deletions(-) diff --git a/dbms/src/IO/PeekableReadBuffer.cpp b/dbms/src/IO/PeekableReadBuffer.cpp index 6d70f790b54..f624d8b8104 100644 --- a/dbms/src/IO/PeekableReadBuffer.cpp +++ b/dbms/src/IO/PeekableReadBuffer.cpp @@ -268,7 +268,7 @@ bool PeekableReadBuffer::checkpointInOwnMemory() const return checkpoint_in_own_memory; } -void PeekableReadBuffer::assertCanBeDistructed() const +void PeekableReadBuffer::assertCanBeDestructed() const { if (peeked_size && pos != memory.data() + peeked_size) throw DB::Exception("There are data, which were extracted from sub-buffer, but not from peekable buffer: " diff --git a/dbms/src/IO/PeekableReadBuffer.h b/dbms/src/IO/PeekableReadBuffer.h index d36bb544940..e6079c4f33e 100644 --- a/dbms/src/IO/PeekableReadBuffer.h +++ b/dbms/src/IO/PeekableReadBuffer.h @@ -49,7 +49,7 @@ public: /// but not from this buffer, so the data will not be lost after destruction of this buffer. /// If position is in sub-buffer, returns empty buffer. std::shared_ptr> takeUnreadData(); - void assertCanBeDistructed() const; + void assertCanBeDestructed() const; private: @@ -59,6 +59,7 @@ private: inline bool currentlyReadFromOwnMemory() const; inline bool checkpointInOwnMemory() const; + // TODO add unit test for PeekableReadBuffer and remove this method void checkStateCorrect() const; /// Makes possible to append `bytes_to_append` bytes to data in own memory. 
@@ -74,4 +75,14 @@ private: bool checkpoint_in_own_memory = false; }; + +class PeekableReadBufferCheckpoint : boost::noncopyable +{ + PeekableReadBuffer & buf; +public: + explicit PeekableReadBufferCheckpoint(PeekableReadBuffer & buf_) : buf(buf_) { buf.setCheckpoint(); } + ~PeekableReadBufferCheckpoint() { buf.dropCheckpoint(); } + +}; + } From 89ae562e830e466ecfc221cdab0f8de8db219265 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 28 May 2019 02:09:35 +0300 Subject: [PATCH 19/43] Fix --- dbms/src/Formats/TemplateBlockOutputStream.cpp | 4 ++-- dbms/src/Formats/TemplateBlockOutputStream.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/Formats/TemplateBlockOutputStream.cpp b/dbms/src/Formats/TemplateBlockOutputStream.cpp index 3986fff8bc1..3094bb33d07 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.cpp +++ b/dbms/src/Formats/TemplateBlockOutputStream.cpp @@ -310,10 +310,10 @@ void TemplateBlockOutputStream::writeSuffix() writeValue(watch.elapsedSeconds(), format.formats[j]); break; case OutputPart::RowsRead: - writeValue(progress.rows.load(), format.formats[j]); + writeValue(progress.read_rows.load(), format.formats[j]); break; case OutputPart::BytesRead: - writeValue(progress.bytes.load(), format.formats[j]); + writeValue(progress.read_bytes.load(), format.formats[j]); break; default: break; diff --git a/dbms/src/Formats/TemplateBlockOutputStream.h b/dbms/src/Formats/TemplateBlockOutputStream.h index ef41e3dba2e..971e4c98c9a 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.h +++ b/dbms/src/Formats/TemplateBlockOutputStream.h @@ -80,7 +80,7 @@ private: ParsedTemplateFormat format; ParsedTemplateFormat row_format; - size_t rows_before_limit; + size_t rows_before_limit = 0; bool rows_before_limit_set = false; Block totals; Block extremes; From d46a27ee01e14c25857ba475f2149dbb1bda6c9c Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 22 Jul 2019 03:28:55 +0300 Subject: [PATCH 20/43] Fixes --- 
dbms/src/Formats/TemplateRowInputStream.cpp | 4 +++- dbms/src/Formats/TemplateRowInputStream.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dbms/src/Formats/TemplateRowInputStream.cpp b/dbms/src/Formats/TemplateRowInputStream.cpp index 5dcd7c713da..771e6238c66 100644 --- a/dbms/src/Formats/TemplateRowInputStream.cpp +++ b/dbms/src/Formats/TemplateRowInputStream.cpp @@ -318,11 +318,13 @@ void registerInputFormatTemplate(FormatFactory & factory) const Block & sample, const Context &, UInt64 max_block_size, + UInt64 rows_portion_size, + FormatFactory::ReadCallback callback, const FormatSettings & settings) { return std::make_shared( std::make_shared(buf, sample, settings, ignore_spaces), - sample, max_block_size, settings); + sample, max_block_size, rows_portion_size, callback, settings); }); } } diff --git a/dbms/src/Formats/TemplateRowInputStream.h b/dbms/src/Formats/TemplateRowInputStream.h index 8858ca5704e..c13251a52f5 100644 --- a/dbms/src/Formats/TemplateRowInputStream.h +++ b/dbms/src/Formats/TemplateRowInputStream.h @@ -29,7 +29,7 @@ private: inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); } bool checkForSuffix(); - void throwUnexpectedEof(); + [[noreturn]] void throwUnexpectedEof(); bool compareSuffixPart(StringRef & suffix, BufferBase::Position pos, size_t available); bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; From 0136ef9a5fbf80a89536967b9b7733bb9e54a01b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 23 Aug 2019 22:47:22 +0300 Subject: [PATCH 21/43] move formats to processors --- dbms/src/Formats/CSVRowInputStream.cpp | 437 ------------------ dbms/src/Formats/CSVRowInputStream.h | 71 --- dbms/src/Formats/FormatFactory.cpp | 12 +- .../Formats/TabSeparatedRowInputStream.cpp | 373 --------------- dbms/src/Formats/TabSeparatedRowInputStream.h | 58 --- .../Formats/Impl/CSVRowInputFormat.cpp | 247 +++------- .../Formats/Impl/CSVRowInputFormat.h | 36 +- 
.../Impl/TabSeparatedRowInputFormat.cpp | 182 ++------ .../Formats/Impl/TabSeparatedRowInputFormat.h | 25 +- .../Impl/TemplateBlockOutputFormat.cpp} | 79 ++-- .../Formats/Impl/TemplateBlockOutputFormat.h} | 37 +- .../Formats/Impl/TemplateRowInputFormat.cpp} | 56 ++- .../Formats/Impl/TemplateRowInputFormat.h} | 15 +- .../RowInputFormatWithDiagnosticInfo.cpp} | 39 +- .../RowInputFormatWithDiagnosticInfo.h} | 10 +- 15 files changed, 233 insertions(+), 1444 deletions(-) delete mode 100644 dbms/src/Formats/CSVRowInputStream.cpp delete mode 100644 dbms/src/Formats/CSVRowInputStream.h delete mode 100644 dbms/src/Formats/TabSeparatedRowInputStream.cpp delete mode 100644 dbms/src/Formats/TabSeparatedRowInputStream.h rename dbms/src/{Formats/TemplateBlockOutputStream.cpp => Processors/Formats/Impl/TemplateBlockOutputFormat.cpp} (82%) rename dbms/src/{Formats/TemplateBlockOutputStream.h => Processors/Formats/Impl/TemplateBlockOutputFormat.h} (71%) rename dbms/src/{Formats/TemplateRowInputStream.cpp => Processors/Formats/Impl/TemplateRowInputFormat.cpp} (82%) rename dbms/src/{Formats/TemplateRowInputStream.h => Processors/Formats/Impl/TemplateRowInputFormat.h} (71%) rename dbms/src/{Formats/RowInputStreamWithDiagnosticInfo.cpp => Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp} (79%) rename dbms/src/{Formats/RowInputStreamWithDiagnosticInfo.h => Processors/Formats/RowInputFormatWithDiagnosticInfo.h} (84%) diff --git a/dbms/src/Formats/CSVRowInputStream.cpp b/dbms/src/Formats/CSVRowInputStream.cpp deleted file mode 100644 index bbe8161b5fe..00000000000 --- a/dbms/src/Formats/CSVRowInputStream.cpp +++ /dev/null @@ -1,437 +0,0 @@ -#include -#include - -#include -#include -#include -#include -#include - -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int INCORRECT_DATA; -} - - -static inline void skipEndOfLine(ReadBuffer & istr) -{ - /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) - - if (*istr.position() == '\n') - { - 
++istr.position(); - if (!istr.eof() && *istr.position() == '\r') - ++istr.position(); - } - else if (*istr.position() == '\r') - { - ++istr.position(); - if (!istr.eof() && *istr.position() == '\n') - ++istr.position(); - else - throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." - " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA); - } - else if (!istr.eof()) - throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA); -} - - -static inline void skipDelimiter(ReadBuffer & istr, const char delimiter, bool is_last_column) -{ - if (is_last_column) - { - if (istr.eof()) - return; - - /// we support the extra delimiter at the end of the line - if (*istr.position() == delimiter) - { - ++istr.position(); - if (istr.eof()) - return; - } - - skipEndOfLine(istr); - } - else - assertChar(delimiter, istr); -} - - -/// Skip `whitespace` symbols allowed in CSV. -static inline void skipWhitespacesAndTabs(ReadBuffer & buf) -{ - while (!buf.eof() - && (*buf.position() == ' ' - || *buf.position() == '\t')) - ++buf.position(); -} - - -static void skipRow(ReadBuffer & istr, const FormatSettings::CSV & settings, size_t num_columns) -{ - String tmp; - for (size_t i = 0; i < num_columns; ++i) - { - skipWhitespacesAndTabs(istr); - readCSVString(tmp, istr, settings); - skipWhitespacesAndTabs(istr); - - skipDelimiter(istr, settings.delimiter, i + 1 == num_columns); - } -} - - -CSVRowInputStream::CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings_) - : RowInputStreamWithDiagnosticInfo(istr_, header_), with_names(with_names_), format_settings(format_settings_) -{ - const auto num_columns = header.columns(); - - data_types.resize(num_columns); - column_indexes_by_names.reserve(num_columns); - column_idx_to_nullable_column_idx.resize(num_columns); - - for (size_t i = 0; i < num_columns; ++i) - { - const auto & column_info = header.getByPosition(i); - 
- data_types[i] = column_info.type; - column_indexes_by_names.emplace(column_info.name, i); - - /// If input_format_null_as_default=1 we need ColumnNullable of type DataTypeNullable(nested_type) - /// to parse value as nullable before inserting it in corresponding column of not-nullable type. - /// Constructing temporary column for each row is slow, so we prepare it here - if (format_settings.csv.null_as_default && !column_info.type->isNullable() && column_info.type->canBeInsideNullable()) - { - column_idx_to_nullable_column_idx[i] = nullable_columns.size(); - nullable_types.emplace_back(std::make_shared(column_info.type)); - nullable_columns.emplace_back(nullable_types.back()->createColumn()); - } - } -} - -/// Map an input file column to a table column, based on its name. -void CSVRowInputStream::addInputColumn(const String & column_name) -{ - const auto column_it = column_indexes_by_names.find(column_name); - if (column_it == column_indexes_by_names.end()) - { - if (format_settings.skip_unknown_fields) - { - column_indexes_for_input_fields.push_back(std::nullopt); - return; - } - - throw Exception( - "Unknown field found in CSV header: '" + column_name + "' " + - "at position " + std::to_string(column_indexes_for_input_fields.size()) + - "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed", - ErrorCodes::INCORRECT_DATA - ); - } - - const auto column_index = column_it->second; - - if (read_columns[column_index]) - throw Exception("Duplicate field found while parsing CSV header: " + column_name, ErrorCodes::INCORRECT_DATA); - - read_columns[column_index] = true; - column_indexes_for_input_fields.emplace_back(column_index); -} - -void CSVRowInputStream::readPrefix() -{ - /// In this format, we assume, that if first string field contain BOM as value, it will be written in quotes, - /// so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it. 
- skipBOMIfExists(istr); - - if (with_names) - { - /// This CSV file has a header row with column names. Depending on the - /// settings, use it or skip it. - if (format_settings.with_names_use_header) - { - /// Look at the file header to see which columns we have there. - /// The missing columns are filled with defaults. - read_columns.assign(header.columns(), false); - do - { - String column_name; - skipWhitespacesAndTabs(istr); - readCSVString(column_name, istr, format_settings.csv); - skipWhitespacesAndTabs(istr); - - addInputColumn(column_name); - } - while (checkChar(format_settings.csv.delimiter, istr)); - - skipDelimiter(istr, format_settings.csv.delimiter, true); - - for (size_t column = 0; column < read_columns.size(); column++) - { - if (!read_columns[column]) - { - have_always_default_columns = true; - break; - } - } - - return; - } - else - { - skipRow(istr, format_settings.csv, header.columns()); - } - } - - /// The default: map each column of the file to the column of the table with - /// the same index. - read_columns.assign(header.columns(), true); - column_indexes_for_input_fields.resize(header.columns()); - - for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i) - { - column_indexes_for_input_fields[i] = i; - } -} - -/** If you change this function, don't forget to change its counterpart - * with extended error reporting: parseRowAndPrintDiagnosticInfo(). - */ -bool CSVRowInputStream::read(MutableColumns & columns, RowReadExtension & ext) -{ - if (istr.eof()) - return false; - - updateDiagnosticInfo(); - - /// Track whether we have to fill any columns in this row with default - /// values. If not, we return an empty column mask to the caller, so that - /// it doesn't have to check it. 
- bool have_default_columns = have_always_default_columns; - - const auto delimiter = format_settings.csv.delimiter; - for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column) - { - const auto & table_column = column_indexes_for_input_fields[file_column]; - const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size(); - - if (table_column) - { - skipWhitespacesAndTabs(istr); - read_columns[*table_column] = readField(*columns[*table_column], data_types[*table_column], - is_last_file_column, *table_column); - if (!read_columns[*table_column]) - have_default_columns = true; - skipWhitespacesAndTabs(istr); - } - else - { - /// We never read this column from the file, just skip it. - String tmp; - readCSVString(tmp, istr, format_settings.csv); - } - - skipDelimiter(istr, delimiter, is_last_file_column); - } - - if (have_default_columns) - { - for (size_t i = 0; i < read_columns.size(); i++) - { - if (!read_columns[i]) - { - /// The column value for this row is going to be overwritten - /// with default by the caller, but the general assumption is - /// that the column size increases for each row, so we have - /// to insert something. Since we do not care about the exact - /// value, we do not have to use the default value specified by - /// the data type, and can just use IColumn::insertDefault(). 
- columns[i]->insertDefault(); - } - } - ext.read_columns = read_columns; - } - - return true; -} - -bool CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) -{ - const char delimiter = format_settings.csv.delimiter; - - for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column) - { - if (file_column == 0 && istr.eof()) - { - out << "\n"; - return false; - } - - if (column_indexes_for_input_fields[file_column].has_value()) - { - size_t col_idx = column_indexes_for_input_fields[file_column].value(); - if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], - out, file_column)) - return false; - } - else - { - static const String skipped_column_str = ""; - static const DataTypePtr skipped_column_type = std::make_shared(); - static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); - if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) - return false; - } - - /// Delimiters - if (file_column + 1 == column_indexes_for_input_fields.size()) - { - if (istr.eof()) - return false; - - /// we support the extra delimiter at the end of the line - if (*istr.position() == delimiter) - { - ++istr.position(); - if (istr.eof()) - break; - } - - if (!istr.eof() && *istr.position() != '\n' && *istr.position() != '\r') - { - out << "ERROR: There is no line feed. 
"; - verbosePrintString(istr.position(), istr.position() + 1, out); - out << " found instead.\n" - " It's like your file has more columns than expected.\n" - "And if your file have right number of columns, maybe it have unquoted string value with comma.\n"; - - return false; - } - - skipEndOfLine(istr); - } - else - { - try - { - assertChar(delimiter, istr); - } - catch (const DB::Exception &) - { - if (*istr.position() == '\n' || *istr.position() == '\r') - { - out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected." - " It's like your file has less columns than expected.\n" - "And if your file have right number of columns, maybe it have unescaped quotes in values.\n"; - } - else - { - out << "ERROR: There is no delimiter (" << delimiter << "). "; - verbosePrintString(istr.position(), istr.position() + 1, out); - out << " found instead.\n"; - } - return false; - } - } - } - - return true; -} - - -void CSVRowInputStream::syncAfterError() -{ - skipToNextLineOrEOF(istr); -} - -void -CSVRowInputStream::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) -{ - skipWhitespacesAndTabs(istr); - prev_pos = istr.position(); - - if (column_indexes_for_input_fields[input_position]) - { - const bool is_last_file_column = input_position + 1 == column_indexes_for_input_fields.size(); - if (!readField(column, type, is_last_file_column, *column_indexes_for_input_fields[input_position])) - column.insertDefault(); - } - else - { - String tmp; - readCSVString(tmp, istr, format_settings.csv); - } - - curr_pos = istr.position(); - skipWhitespacesAndTabs(istr); -} - -bool CSVRowInputStream::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx) -{ - const bool at_delimiter = !istr.eof() || *istr.position() == format_settings.csv.delimiter; - const bool at_last_column_line_end = is_last_file_column - && (istr.eof() || 
*istr.position() == '\n' || *istr.position() == '\r'); - - if (format_settings.csv.empty_as_default - && (at_delimiter || at_last_column_line_end)) - { - /// Treat empty unquoted column value as default value, if - /// specified in the settings. Tuple columns might seem - /// problematic, because they are never quoted but still contain - /// commas, which might be also used as delimiters. However, - /// they do not contain empty unquoted fields, so this check - /// works for tuples as well. - return false; - } - else if (column_idx_to_nullable_column_idx[column_idx]) - { - /// If value is null but type is not nullable then use default value instead. - const size_t nullable_idx = *column_idx_to_nullable_column_idx[column_idx]; - auto & tmp_col = *nullable_columns[nullable_idx]; - nullable_types[nullable_idx]->deserializeAsTextCSV(tmp_col, istr, format_settings); - Field value = tmp_col[0]; - tmp_col.popBack(1); /// do not store copy of values in memory - if (value.isNull()) - return false; - column.insert(value); - return true; - } - else - { - /// Read the column normally. - type->deserializeAsTextCSV(column, istr, format_settings); - return true; - } -} - - -void registerInputFormatCSV(FormatFactory & factory) -{ - for (bool with_names : {false, true}) - { - factory.registerInputFormat(with_names ? 
"CSVWithNames" : "CSV", [=]( - ReadBuffer & buf, - const Block & sample, - const Context &, - UInt64 max_block_size, - UInt64 rows_portion_size, - FormatFactory::ReadCallback callback, - const FormatSettings & settings) - { - return std::make_shared( - std::make_shared(buf, sample, with_names, settings), - sample, max_block_size, rows_portion_size, callback, settings); - }); - } -} - -} diff --git a/dbms/src/Formats/CSVRowInputStream.h b/dbms/src/Formats/CSVRowInputStream.h deleted file mode 100644 index 192bb7c60f5..00000000000 --- a/dbms/src/Formats/CSVRowInputStream.h +++ /dev/null @@ -1,71 +0,0 @@ -#pragma once - -#include -#include - -#include -#include -#include - - -namespace DB -{ - -class ReadBuffer; - -/** A stream for inputting data in csv format. - * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. - */ -class CSVRowInputStream : public RowInputStreamWithDiagnosticInfo -{ -public: - /** with_names - in the first line the header with column names - */ - CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings_); - - bool read(MutableColumns & columns, RowReadExtension & ext) override; - void readPrefix() override; - bool allowSyncAfterError() const override { return true; } - void syncAfterError() override; - -private: - bool with_names; - const FormatSettings format_settings; - DataTypes data_types; - - using IndexesMap = std::unordered_map; - IndexesMap column_indexes_by_names; - - /// Maps indexes of columns in the input file to indexes of table columns - using OptionalIndexes = std::vector>; - OptionalIndexes column_indexes_for_input_fields; - - /// Tracks which colums we have read in a single read() call. - /// For columns that are never read, it is initialized to false when we - /// read the file header, and never changed afterwards. - /// For other columns, it is updated on each read() call. 
- std::vector read_columns; - - /// Whether we have any columns that are not read from file at all, - /// and must be always initialized with defaults. - bool have_always_default_columns = false; - - void addInputColumn(const String & column_name); - - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) override; - bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override - { - return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter; - } - - /// For setting input_format_null_as_default - DataTypes nullable_types; - MutableColumns nullable_columns; - OptionalIndexes column_idx_to_nullable_column_idx; - - bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx); -}; - -} diff --git a/dbms/src/Formats/FormatFactory.cpp b/dbms/src/Formats/FormatFactory.cpp index 7d8d6b7ed6b..8a1ad3d7bd2 100644 --- a/dbms/src/Formats/FormatFactory.cpp +++ b/dbms/src/Formats/FormatFactory.cpp @@ -226,10 +226,6 @@ void FormatFactory::registerOutputFormatProcessor(const String & name, OutputPro void registerInputFormatNative(FormatFactory & factory); void registerOutputFormatNative(FormatFactory & factory); -void registerInputFormatTabSeparated(FormatFactory & factory); -void registerInputFormatCSV(FormatFactory & factory); -void registerInputFormatTemplate(FormatFactory & factory); -void registerOutputFormatTemplate(FormatFactory &factory); void registerInputFormatProcessorNative(FormatFactory & factory); void registerOutputFormatProcessorNative(FormatFactory & factory); @@ -250,6 +246,8 @@ void registerInputFormatProcessorORC(FormatFactory & factory); void registerOutputFormatProcessorParquet(FormatFactory & factory); void registerInputFormatProcessorProtobuf(FormatFactory & factory); void 
registerOutputFormatProcessorProtobuf(FormatFactory & factory); +void registerInputFormatProcessorTemplate(FormatFactory & factory); +void registerOutputFormatProcessorTemplate(FormatFactory &factory); /// Output only (presentational) formats. @@ -275,10 +273,6 @@ FormatFactory::FormatFactory() { registerInputFormatNative(*this); registerOutputFormatNative(*this); - registerInputFormatTabSeparated(*this); - registerInputFormatCSV(*this); - registerInputFormatTemplate(*this); - registerOutputFormatTemplate(*this); registerOutputFormatProcessorJSONEachRowWithProgress(*this); @@ -302,6 +296,8 @@ FormatFactory::FormatFactory() registerInputFormatProcessorORC(*this); registerInputFormatProcessorParquet(*this); registerOutputFormatProcessorParquet(*this); + registerInputFormatProcessorTemplate(*this); + registerOutputFormatProcessorTemplate(*this); registerOutputFormatNull(*this); diff --git a/dbms/src/Formats/TabSeparatedRowInputStream.cpp b/dbms/src/Formats/TabSeparatedRowInputStream.cpp deleted file mode 100644 index 0d8491bbc50..00000000000 --- a/dbms/src/Formats/TabSeparatedRowInputStream.cpp +++ /dev/null @@ -1,373 +0,0 @@ -#include -#include - -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int INCORRECT_DATA; -} - - -static void skipTSVRow(ReadBuffer & istr, const size_t num_columns) -{ - NullSink null_sink; - - for (size_t i = 0; i < num_columns; ++i) - { - readEscapedStringInto(null_sink, istr); - assertChar(i == num_columns - 1 ? '\n' : '\t', istr); - } -} - - -/** Check for a common error case - usage of Windows line feed. - */ -static void checkForCarriageReturn(ReadBuffer & istr) -{ - if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r')) - throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row." - "\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format." 
- " You must transform your file to Unix format." - "\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r.", - ErrorCodes::INCORRECT_DATA); -} - - -TabSeparatedRowInputStream::TabSeparatedRowInputStream( - ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : RowInputStreamWithDiagnosticInfo(istr_, header_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) -{ - const auto num_columns = header.columns(); - - data_types.resize(num_columns); - column_indexes_by_names.reserve(num_columns); - - for (size_t i = 0; i < num_columns; ++i) - { - const auto & column_info = header.getByPosition(i); - - data_types[i] = column_info.type; - column_indexes_by_names.emplace(column_info.name, i); - } - - column_indexes_for_input_fields.reserve(num_columns); - read_columns.assign(num_columns, false); -} - - -void TabSeparatedRowInputStream::setupAllColumnsByTableSchema() -{ - read_columns.assign(header.columns(), true); - column_indexes_for_input_fields.resize(header.columns()); - - for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i) - column_indexes_for_input_fields[i] = i; -} - - -void TabSeparatedRowInputStream::addInputColumn(const String & column_name) -{ - const auto column_it = column_indexes_by_names.find(column_name); - if (column_it == column_indexes_by_names.end()) - { - if (format_settings.skip_unknown_fields) - { - column_indexes_for_input_fields.push_back(std::nullopt); - return; - } - - throw Exception( - "Unknown field found in TSV header: '" + column_name + "' " + - "at position " + std::to_string(column_indexes_for_input_fields.size()) + - "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed", - ErrorCodes::INCORRECT_DATA - ); - } - - const auto column_index = column_it->second; - - if (read_columns[column_index]) - throw Exception("Duplicate field found 
while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA); - - read_columns[column_index] = true; - column_indexes_for_input_fields.emplace_back(column_index); -} - - -void TabSeparatedRowInputStream::fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension) -{ - /// It is safe to memorize this on the first run - the format guarantees this does not change - if (unlikely(row_num == 1)) - { - columns_to_fill_with_default_values.clear(); - for (size_t index = 0; index < read_columns.size(); ++index) - if (read_columns[index] == 0) - columns_to_fill_with_default_values.push_back(index); - } - - for (const auto column_index : columns_to_fill_with_default_values) - data_types[column_index]->insertDefaultInto(*columns[column_index]); - - row_read_extension.read_columns = read_columns; -} - - -void TabSeparatedRowInputStream::readPrefix() -{ - if (with_names || with_types) - { - /// In this format, we assume that column name or type cannot contain BOM, - /// so, if format has header, - /// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it. 
- skipBOMIfExists(istr); - } - - if (with_names) - { - if (format_settings.with_names_use_header) - { - String column_name; - do - { - readEscapedString(column_name, istr); - addInputColumn(column_name); - } - while (checkChar('\t', istr)); - - if (!istr.eof()) - { - checkForCarriageReturn(istr); - assertChar('\n', istr); - } - } - else - { - setupAllColumnsByTableSchema(); - skipTSVRow(istr, column_indexes_for_input_fields.size()); - } - } - else - setupAllColumnsByTableSchema(); - - if (with_types) - { - skipTSVRow(istr, column_indexes_for_input_fields.size()); - } -} - - -bool TabSeparatedRowInputStream::read(MutableColumns & columns, RowReadExtension & ext) -{ - if (istr.eof()) - return false; - - updateDiagnosticInfo(); - - for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position) - { - const auto & column_index = column_indexes_for_input_fields[input_position]; - if (column_index) - { - data_types[*column_index]->deserializeAsTextEscaped(*columns[*column_index], istr, format_settings); - } - else - { - NullSink null_sink; - readEscapedStringInto(null_sink, istr); - } - - /// skip separators - if (input_position + 1 < column_indexes_for_input_fields.size()) - { - assertChar('\t', istr); - } - else if (!istr.eof()) - { - if (unlikely(row_num == 1)) - checkForCarriageReturn(istr); - - assertChar('\n', istr); - } - } - - fillUnreadColumnsWithDefaults(columns, ext); - - return true; -} - -bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) -{ - for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position) - { - if (input_position == 0 && istr.eof()) - { - out << "\n"; - return false; - } - - if (column_indexes_for_input_fields[input_position].has_value()) - { - size_t col_idx = column_indexes_for_input_fields[input_position].value(); - if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, 
data_types[col_idx], *columns[col_idx], - out, input_position)) - return false; - } - else - { - static const String skipped_column_str = ""; - static const DataTypePtr skipped_column_type = std::make_shared(); - static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); - if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, input_position)) - return false; - } - - /// Delimiters - if (input_position + 1 == column_indexes_for_input_fields.size()) - { - if (!istr.eof()) - { - try - { - assertChar('\n', istr); - } - catch (const DB::Exception &) - { - if (*istr.position() == '\t') - { - out << "ERROR: Tab found where line feed is expected." - " It's like your file has more columns than expected.\n" - "And if your file have right number of columns, maybe it have unescaped tab in value.\n"; - } - else if (*istr.position() == '\r') - { - out << "ERROR: Carriage return found where line feed is expected." - " It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n"; - } - else - { - out << "ERROR: There is no line feed. "; - verbosePrintString(istr.position(), istr.position() + 1, out); - out << " found instead.\n"; - } - return false; - } - } - } - else - { - try - { - assertChar('\t', istr); - } - catch (const DB::Exception &) - { - if (*istr.position() == '\n') - { - out << "ERROR: Line feed found where tab is expected." - " It's like your file has less columns than expected.\n" - "And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n"; - } - else if (*istr.position() == '\r') - { - out << "ERROR: Carriage return found where tab is expected.\n"; - } - else - { - out << "ERROR: There is no tab. 
"; - verbosePrintString(istr.position(), istr.position() + 1, out); - out << " found instead.\n"; - } - return false; - } - } - } - - return true; -} - - -void TabSeparatedRowInputStream::syncAfterError() -{ - skipToUnescapedNextLineOrEOF(istr); -} - -void TabSeparatedRowInputStream::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, - ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) -{ - prev_pos = istr.position(); - if (column_indexes_for_input_fields[input_position]) - type->deserializeAsTextEscaped(column, istr, format_settings); - else - { - NullSink null_sink; - readEscapedStringInto(null_sink, istr); - } - curr_pos = istr.position(); -} - - -void registerInputFormatTabSeparated(FormatFactory & factory) -{ - for (auto name : {"TabSeparated", "TSV"}) - { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - const Context &, - UInt64 max_block_size, - UInt64 rows_portion_size, - FormatFactory::ReadCallback callback, - const FormatSettings & settings) - { - return std::make_shared( - std::make_shared(buf, sample, false, false, settings), - sample, max_block_size, rows_portion_size, callback, settings); - }); - } - - for (auto name : {"TabSeparatedWithNames", "TSVWithNames"}) - { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - const Context &, - UInt64 max_block_size, - UInt64 rows_portion_size, - FormatFactory::ReadCallback callback, - const FormatSettings & settings) - { - return std::make_shared( - std::make_shared(buf, sample, true, false, settings), - sample, max_block_size, rows_portion_size, callback, settings); - }); - } - - for (auto name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"}) - { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - const Context &, - UInt64 max_block_size, - UInt64 rows_portion_size, - FormatFactory::ReadCallback callback, - const FormatSettings & settings) - 
{ - return std::make_shared( - std::make_shared(buf, sample, true, true, settings), - sample, max_block_size, rows_portion_size, callback, settings); - }); - } -} - -} diff --git a/dbms/src/Formats/TabSeparatedRowInputStream.h b/dbms/src/Formats/TabSeparatedRowInputStream.h deleted file mode 100644 index 0b296e1f1e4..00000000000 --- a/dbms/src/Formats/TabSeparatedRowInputStream.h +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include -#include - -#include -#include -#include - - -namespace DB -{ - -class ReadBuffer; - - -/** A stream to input data in tsv format. - */ -class TabSeparatedRowInputStream : public RowInputStreamWithDiagnosticInfo -{ -public: - /** with_names - the first line is the header with the names of the columns - * with_types - on the next line header with type names - */ - TabSeparatedRowInputStream( - ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings_); - - bool read(MutableColumns & columns, RowReadExtension & ext) override; - void readPrefix() override; - bool allowSyncAfterError() const override { return true; } - void syncAfterError() override; - -private: - bool with_names; - bool with_types; - const FormatSettings format_settings; - DataTypes data_types; - - using IndexesMap = std::unordered_map; - IndexesMap column_indexes_by_names; - - using OptionalIndexes = std::vector>; - OptionalIndexes column_indexes_for_input_fields; - - std::vector read_columns; - std::vector columns_to_fill_with_default_values; - - void addInputColumn(const String & column_name); - void setupAllColumnsByTableSchema(); - void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext); - - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) override; - bool 
isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } -}; - -} diff --git a/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 2b8c1be6016..440487e758a 100644 --- a/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -19,7 +20,7 @@ namespace ErrorCodes CSVRowInputFormat::CSVRowInputFormat( ReadBuffer & in_, Block header_, Params params_, bool with_names_, const FormatSettings & format_settings_) - : IRowInputFormat(std::move(header_), in_, std::move(params_)) + : RowInputFormatWithDiagnosticInfo(header_, in_, params_) , with_names(with_names_) , format_settings(format_settings_) { @@ -79,72 +80,72 @@ void CSVRowInputFormat::addInputColumn(const String & column_name) column_indexes_for_input_fields.emplace_back(column_index); } -static void skipEndOfLine(ReadBuffer & istr) +static void skipEndOfLine(ReadBuffer & in) { /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) - if (*istr.position() == '\n') + if (*in.position() == '\n') { - ++istr.position(); - if (!istr.eof() && *istr.position() == '\r') - ++istr.position(); + ++in.position(); + if (!in.eof() && *in.position() == '\r') + ++in.position(); } - else if (*istr.position() == '\r') + else if (*in.position() == '\r') { - ++istr.position(); - if (!istr.eof() && *istr.position() == '\n') - ++istr.position(); + ++in.position(); + if (!in.eof() && *in.position() == '\n') + ++in.position(); else throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." 
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA); } - else if (!istr.eof()) + else if (!in.eof()) throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA); } -static void skipDelimiter(ReadBuffer & istr, const char delimiter, bool is_last_column) +static void skipDelimiter(ReadBuffer & in, const char delimiter, bool is_last_column) { if (is_last_column) { - if (istr.eof()) + if (in.eof()) return; /// we support the extra delimiter at the end of the line - if (*istr.position() == delimiter) + if (*in.position() == delimiter) { - ++istr.position(); - if (istr.eof()) + ++in.position(); + if (in.eof()) return; } - skipEndOfLine(istr); + skipEndOfLine(in); } else - assertChar(delimiter, istr); + assertChar(delimiter, in); } /// Skip `whitespace` symbols allowed in CSV. -static inline void skipWhitespacesAndTabs(ReadBuffer & buf) +static inline void skipWhitespacesAndTabs(ReadBuffer & in) { - while (!buf.eof() - && (*buf.position() == ' ' - || *buf.position() == '\t')) - ++buf.position(); + while (!in.eof() + && (*in.position() == ' ' + || *in.position() == '\t')) + ++in.position(); } -static void skipRow(ReadBuffer & istr, const FormatSettings::CSV & settings, size_t num_columns) +static void skipRow(ReadBuffer & in, const FormatSettings::CSV & settings, size_t num_columns) { String tmp; for (size_t i = 0; i < num_columns; ++i) { - skipWhitespacesAndTabs(istr); - readCSVString(tmp, istr, settings); - skipWhitespacesAndTabs(istr); + skipWhitespacesAndTabs(in); + readCSVString(tmp, in, settings); + skipWhitespacesAndTabs(in); - skipDelimiter(istr, settings.delimiter, i + 1 == num_columns); + skipDelimiter(in, settings.delimiter, i + 1 == num_columns); } } @@ -156,7 +157,6 @@ void CSVRowInputFormat::readPrefix() skipBOMIfExists(in); size_t num_columns = data_types.size(); - String tmp; auto & header = getPort().getHeader(); if (with_names) @@ -224,8 +224,7 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, 
RowReadExtension & ext for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column) { const auto & table_column = column_indexes_for_input_fields[file_column]; - const bool is_last_file_column = - file_column + 1 == column_indexes_for_input_fields.size(); + const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size(); if (table_column) { @@ -267,71 +266,7 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext return true; } - -String CSVRowInputFormat::getDiagnosticInfo() -{ - if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed. - return {}; - - WriteBufferFromOwnString out; - - auto & header = getPort().getHeader(); - MutableColumns columns = header.cloneEmptyColumns(); - - /// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer. - size_t bytes_read_at_start_of_buffer = in.count() - in.offset(); - if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) - { - out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; - return out.str(); - } - - size_t max_length_of_column_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).name.size() > max_length_of_column_name) - max_length_of_column_name = header.safeGetByPosition(i).name.size(); - - size_t max_length_of_data_type_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name) - max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size(); - - /// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information. 
- - if (pos_of_prev_row) - { - in.position() = pos_of_prev_row; - - out << "\nRow " << (row_num - 1) << ":\n"; - if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name)) - return out.str(); - } - else - { - if (!pos_of_current_row) - { - out << "Could not print diagnostic info because parsing of data hasn't started.\n"; - return out.str(); - } - - in.position() = pos_of_current_row; - } - - out << "\nRow " << row_num << ":\n"; - parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name); - out << "\n"; - - return out.str(); -} - -/** gcc-7 generates wrong code with optimization level greater than 1. - * See tests: dbms/src/IO/tests/write_int.cpp - * and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh - * This is compiler bug. The bug does not present in gcc-8 and clang-8. - * Nevertheless, we don't need high optimization of this function. - */ -bool OPTIMIZE(1) CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, - WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name) +bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { const char delimiter = format_settings.csv.delimiter; @@ -345,100 +280,19 @@ bool OPTIMIZE(1) CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumn if (column_indexes_for_input_fields[file_column].has_value()) { - const auto & table_column = *column_indexes_for_input_fields[file_column]; - const auto & current_column_type = data_types[table_column]; - const bool is_last_file_column = - file_column + 1 == column_indexes_for_input_fields.size(); - const bool at_delimiter = !in.eof() && *in.position() == delimiter; - const bool at_last_column_line_end = is_last_file_column - && (in.eof() || *in.position() == '\n' || *in.position() == '\r'); - auto & header = getPort().getHeader(); - out << "Column " << file_column << ", 
" << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ') - << "name: " << header.safeGetByPosition(table_column).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(table_column).name.size(), ' ') - << "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' '); - - if (format_settings.csv.empty_as_default - && (at_delimiter || at_last_column_line_end)) - { - columns[table_column]->insertDefault(); - } - else - { - BufferBase::Position prev_position = in.position(); - BufferBase::Position curr_position = in.position(); - std::exception_ptr exception; - - try - { - skipWhitespacesAndTabs(in); - prev_position = in.position(); - readField(*columns[table_column], current_column_type, is_last_file_column, table_column); - curr_position = in.position(); - skipWhitespacesAndTabs(in); - } - catch (...) - { - exception = std::current_exception(); - } - - if (curr_position < prev_position) - throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR); - - if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type)) - { - /// An empty string instead of a value. 
- if (curr_position == prev_position) - { - out << "ERROR: text "; - verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out); - out << " is not like " << current_column_type->getName() << "\n"; - return false; - } - } - - out << "parsed text: "; - verbosePrintString(prev_position, curr_position, out); - - if (exception) - { - if (current_column_type->getName() == "DateTime") - out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; - else if (current_column_type->getName() == "Date") - out << "ERROR: Date must be in YYYY-MM-DD format.\n"; - else - out << "ERROR\n"; - return false; - } - - out << "\n"; - - if (current_column_type->haveMaximumSizeOfValue() - && *curr_position != '\n' && *curr_position != '\r' - && *curr_position != delimiter) - { - out << "ERROR: garbage after " << current_column_type->getName() << ": "; - verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out); - out << "\n"; - - if (current_column_type->getName() == "DateTime") - out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; - else if (current_column_type->getName() == "Date") - out << "ERROR: Date must be in YYYY-MM-DD format.\n"; - - return false; - } - } + size_t col_idx = column_indexes_for_input_fields[file_column].value(); + if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], + out, file_column)) + return false; } else { static const String skipped_column_str = ""; - out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 
1 : 0), ' ') - << "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ') - << "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' '); - - String tmp; - readCSVString(tmp, in, format_settings.csv); + static const DataTypePtr skipped_column_type = std::make_shared(); + static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); + if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) + return false; } /// Delimiters @@ -502,15 +356,26 @@ void CSVRowInputFormat::syncAfterError() skipToNextLineOrEOF(in); } -void CSVRowInputFormat::updateDiagnosticInfo() +void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) { - ++row_num; + skipWhitespacesAndTabs(in); + prev_pos = in.position(); - bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row; - bytes_read_at_start_of_buffer_on_current_row = in.count() - in.offset(); + if (column_indexes_for_input_fields[input_position]) + { + const bool is_last_file_column = input_position + 1 == column_indexes_for_input_fields.size(); + if (!readField(column, type, is_last_file_column, *column_indexes_for_input_fields[input_position])) + column.insertDefault(); + } + else + { + String tmp; + readCSVString(tmp, in, format_settings.csv); + } - pos_of_prev_row = pos_of_current_row; - pos_of_current_row = in.position(); + curr_pos = in.position(); + skipWhitespacesAndTabs(in); } bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx) diff --git a/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.h b/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.h index 59b24ae0140..6ca40425ebd 100644 --- 
a/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -1,7 +1,10 @@ #pragma once +#include +#include + #include -#include +#include #include @@ -13,28 +16,24 @@ class ReadBuffer; /** A stream for inputting data in csv format. * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. */ -class CSVRowInputFormat : public IRowInputFormat +class CSVRowInputFormat : public RowInputFormatWithDiagnosticInfo { public: /** with_names - in the first line the header with column names - * with_types - on the next line header with type names */ CSVRowInputFormat(ReadBuffer & in_, Block header_, Params params_, bool with_names_, const FormatSettings & format_settings_); String getName() const override { return "CSVRowInputFormat"; } - bool readRow(MutableColumns & columns, RowReadExtension &) override; + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; void readPrefix() override; bool allowSyncAfterError() const override { return true; } void syncAfterError() override; - std::string getDiagnosticInfo() override; - private: bool with_names; - DataTypes data_types; - const FormatSettings format_settings; + DataTypes data_types; using IndexesMap = std::unordered_map; IndexesMap column_indexes_by_names; @@ -55,26 +54,19 @@ private: void addInputColumn(const String & column_name); - /// For convenient diagnostics in case of an error. - size_t row_num = 0; - - /// How many bytes were read, not counting those that are still in the buffer. 
- size_t bytes_read_at_start_of_buffer_on_current_row = 0; - size_t bytes_read_at_start_of_buffer_on_prev_row = 0; - - char * pos_of_current_row = nullptr; - char * pos_of_prev_row = nullptr; + bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; + void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) override; + bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override + { + return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter; + } /// For setting input_format_null_as_default DataTypes nullable_types; MutableColumns nullable_columns; OptionalIndexes column_idx_to_nullable_column_idx; - void updateDiagnosticInfo(); - - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, - WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name); - bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx); }; diff --git a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 39c06c9441b..602b29e08c5 100644 --- a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -16,23 +17,23 @@ namespace ErrorCodes } -static void skipTSVRow(ReadBuffer & istr, const size_t num_columns) +static void skipTSVRow(ReadBuffer & in, const size_t num_columns) { NullSink null_sink; for (size_t i = 0; i < num_columns; ++i) { - readEscapedStringInto(null_sink, istr); - assertChar(i == num_columns - 1 ? '\n' : '\t', istr); + readEscapedStringInto(null_sink, in); + assertChar(i == num_columns - 1 ? '\n' : '\t', in); } } /** Check for a common error case - usage of Windows line feed. 
*/ -static void checkForCarriageReturn(ReadBuffer & istr) +static void checkForCarriageReturn(ReadBuffer & in) { - if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r')) + if (in.position()[0] == '\r' || (in.position() != in.buffer().begin() && in.position()[-1] == '\r')) throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row." "\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format." " You must transform your file to Unix format." @@ -43,7 +44,7 @@ static void checkForCarriageReturn(ReadBuffer & istr) TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( ReadBuffer & in_, Block header_, bool with_names_, bool with_types_, Params params_, const FormatSettings & format_settings_) - : IRowInputFormat(std::move(header_), in_, std::move(params_)), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) + : RowInputFormatWithDiagnosticInfo(std::move(header_), in_, std::move(params_)), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) { auto & sample = getPort().getHeader(); size_t num_columns = sample.columns(); @@ -205,65 +206,7 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens return true; } - -String TabSeparatedRowInputFormat::getDiagnosticInfo() -{ - if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed. - return {}; - - auto & header = getPort().getHeader(); - WriteBufferFromOwnString out; - MutableColumns columns = header.cloneEmptyColumns(); - - /// It is possible to display detailed diagnostics only if the last and next to last lines are still in the read buffer. 
- size_t bytes_read_at_start_of_buffer = in.count() - in.offset(); - if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) - { - out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; - return out.str(); - } - - size_t max_length_of_column_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).name.size() > max_length_of_column_name) - max_length_of_column_name = header.safeGetByPosition(i).name.size(); - - size_t max_length_of_data_type_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name) - max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size(); - - /// Roll back the cursor to the beginning of the previous or current line and pars all over again. But now we derive detailed information. - - if (pos_of_prev_row) - { - in.position() = pos_of_prev_row; - - out << "\nRow " << (row_num - 1) << ":\n"; - if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name)) - return out.str(); - } - else - { - if (!pos_of_current_row) - { - out << "Could not print diagnostic info because parsing of data hasn't started.\n"; - return out.str(); - } - - in.position() = pos_of_current_row; - } - - out << "\nRow " << row_num << ":\n"; - parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name); - out << "\n"; - - return out.str(); -} - - -bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, - WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name) +bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position) { @@ -275,86 +218,19 @@ bool 
TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & if (column_indexes_for_input_fields[input_position].has_value()) { - const auto & column_index = *column_indexes_for_input_fields[input_position]; - const auto & current_column_type = data_types[column_index]; - - const auto & header = getPort().getHeader(); - - out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ') - << "name: " << header.safeGetByPosition(column_index).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(column_index).name.size(), ' ') - << "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' '); - - auto prev_position = in.position(); - std::exception_ptr exception; - - try - { - current_column_type->deserializeAsTextEscaped(*columns[column_index], in, format_settings); - } - catch (...) - { - exception = std::current_exception(); - } - - auto curr_position = in.position(); - - if (curr_position < prev_position) - throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR); - - if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type)) - { - /// An empty string instead of a value. 
- if (curr_position == prev_position) - { - out << "ERROR: text "; - verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out); - out << " is not like " << current_column_type->getName() << "\n"; - return false; - } - } - - out << "parsed text: "; - verbosePrintString(prev_position, curr_position, out); - - if (exception) - { - if (current_column_type->getName() == "DateTime") - out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; - else if (current_column_type->getName() == "Date") - out << "ERROR: Date must be in YYYY-MM-DD format.\n"; - else - out << "ERROR\n"; + auto & header = getPort().getHeader(); + size_t col_idx = column_indexes_for_input_fields[input_position].value(); + if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], + out, input_position)) return false; - } - - out << "\n"; - - if (current_column_type->haveMaximumSizeOfValue()) - { - if (*curr_position != '\n' && *curr_position != '\t') - { - out << "ERROR: garbage after " << current_column_type->getName() << ": "; - verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out); - out << "\n"; - - if (current_column_type->getName() == "DateTime") - out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n"; - else if (current_column_type->getName() == "Date") - out << "ERROR: Date must be in YYYY-MM-DD format.\n"; - - return false; - } - } } else { static const String skipped_column_str = ""; - out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 
1 : 0), ' ') - << "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ') - << "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' '); - - NullSink null_sink; - readEscapedStringInto(null_sink, in); + static const DataTypePtr skipped_column_type = std::make_shared(); + static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); + if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, input_position)) + return false; } /// Delimiters @@ -421,6 +297,20 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & return true; } +void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, + ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) +{ + prev_pos = in.position(); + if (column_indexes_for_input_fields[input_position]) + type->deserializeAsTextEscaped(column, in, format_settings); + else + { + NullSink null_sink; + readEscapedStringInto(null_sink, in); + } + curr_pos = in.position(); +} void TabSeparatedRowInputFormat::syncAfterError() { @@ -428,18 +318,6 @@ void TabSeparatedRowInputFormat::syncAfterError() } -void TabSeparatedRowInputFormat::updateDiagnosticInfo() -{ - ++row_num; - - bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row; - bytes_read_at_start_of_buffer_on_current_row = in.count() - in.offset(); - - pos_of_prev_row = pos_of_current_row; - pos_of_current_row = in.position(); -} - - void registerInputFormatProcessorTabSeparated(FormatFactory & factory) { for (auto name : {"TabSeparated", "TSV"}) diff --git a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 076cbb60152..222dcfce473 100644 --- 
a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB @@ -13,7 +13,7 @@ class ReadBuffer; /** A stream to input data in tsv format. */ -class TabSeparatedRowInputFormat : public IRowInputFormat +class TabSeparatedRowInputFormat : public RowInputFormatWithDiagnosticInfo { public: /** with_names - the first line is the header with the names of the columns @@ -29,8 +29,6 @@ public: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; - std::string getDiagnosticInfo() override; - private: bool with_names; bool with_types; @@ -50,21 +48,10 @@ private: void setupAllColumnsByTableSchema(); void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext); - /// For convenient diagnostics in case of an error. - - size_t row_num = 0; - - /// How many bytes were read, not counting those still in the buffer. 
- size_t bytes_read_at_start_of_buffer_on_current_row = 0; - size_t bytes_read_at_start_of_buffer_on_prev_row = 0; - - char * pos_of_current_row = nullptr; - char * pos_of_prev_row = nullptr; - - void updateDiagnosticInfo(); - - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, - WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name); + bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; + void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, + ReadBuffer::Position & curr_pos) override; + bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } }; } diff --git a/dbms/src/Formats/TemplateBlockOutputStream.cpp b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp similarity index 82% rename from dbms/src/Formats/TemplateBlockOutputStream.cpp rename to dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 6b4243b1736..1e1854d7663 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -134,9 +134,15 @@ String ParsedTemplateFormat::formatToString(ParsedTemplateFormat::ColumnFormat f } -TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings_) - : ostr(ostr_), header(sample), settings(settings_) +TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & settings_) + : IOutputFormat(header_, out_), settings(settings_) { + auto & sample = getPort(PortKind::Main).getHeader(); + size_t columns = sample.columns(); + types.resize(columns); + for (size_t i = 0; i < columns; ++i) + types[i] = sample.safeGetByPosition(i).type; + static const String default_format("${data}"); const 
String & format_str = settings.template_settings.format.empty() ? default_format : settings.template_settings.format; format = ParsedTemplateFormat(format_str, [&](const String & partName) @@ -169,14 +175,14 @@ TemplateBlockOutputStream::TemplateBlockOutputStream(WriteBuffer & ostr_, const row_format = ParsedTemplateFormat(settings.template_settings.row_format, [&](const String & colName) { - return header.getPositionByName(colName); + return sample.getPositionByName(colName); }); if (row_format.delimiters.size() == 1) throw Exception("invalid template: no columns specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); } -TemplateBlockOutputStream::OutputPart TemplateBlockOutputStream::stringToOutputPart(const String & part) +TemplateBlockOutputFormat::OutputPart TemplateBlockOutputFormat::stringToOutputPart(const String & part) { if (part == "data") return OutputPart::Data; @@ -200,52 +206,46 @@ TemplateBlockOutputStream::OutputPart TemplateBlockOutputStream::stringToOutputP throw Exception("invalid template: unknown output part " + part, ErrorCodes::INVALID_TEMPLATE_FORMAT); } -void TemplateBlockOutputStream::flush() -{ - ostr.next(); -} - -void TemplateBlockOutputStream::writeRow(const Block & block, size_t row_num) +void TemplateBlockOutputFormat::writeRow(const Chunk & chunk, size_t row_num) { size_t columns = row_format.format_idx_to_column_idx.size(); for (size_t j = 0; j < columns; ++j) { - writeString(row_format.delimiters[j], ostr); + writeString(row_format.delimiters[j], out); size_t col_idx = row_format.format_idx_to_column_idx[j]; - const ColumnWithTypeAndName & col = block.getByPosition(col_idx); - serializeField(*col.column, *col.type, row_num, row_format.formats[j]); + serializeField(*chunk.getColumns()[col_idx], *types[col_idx], row_num, row_format.formats[j]); } - writeString(row_format.delimiters[columns], ostr); + writeString(row_format.delimiters[columns], out); } -void TemplateBlockOutputStream::serializeField(const IColumn & column, const IDataType 
& type, size_t row_num, ColumnFormat col_format) +void TemplateBlockOutputFormat::serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat col_format) { switch (col_format) { case ColumnFormat::Default: case ColumnFormat::Escaped: - type.serializeAsTextEscaped(column, row_num, ostr, settings); + type.serializeAsTextEscaped(column, row_num, out, settings); break; case ColumnFormat::Quoted: - type.serializeAsTextQuoted(column, row_num, ostr, settings); + type.serializeAsTextQuoted(column, row_num, out, settings); break; case ColumnFormat::Csv: - type.serializeAsTextCSV(column, row_num, ostr, settings); + type.serializeAsTextCSV(column, row_num, out, settings); break; case ColumnFormat::Json: - type.serializeAsTextJSON(column, row_num, ostr, settings); + type.serializeAsTextJSON(column, row_num, out, settings); break; case ColumnFormat::Xml: - type.serializeAsTextXML(column, row_num, ostr, settings); + type.serializeAsTextXML(column, row_num, out, settings); break; case ColumnFormat::Raw: - type.serializeAsText(column, row_num, ostr, settings); + type.serializeAsText(column, row_num, out, settings); break; } } -template void TemplateBlockOutputStream::writeValue(U value, ColumnFormat col_format) +template void TemplateBlockOutputFormat::writeValue(U value, ColumnFormat col_format) { auto type = std::make_unique(); auto col = type->createColumn(); @@ -253,27 +253,37 @@ template void TemplateBlockOutputStream::writeValue(U v serializeField(*col, *type, 0, col_format); } -void TemplateBlockOutputStream::write(const Block & block) +void TemplateBlockOutputFormat::consume(Chunk chunk) { - size_t rows = block.rows(); + doWritePrefix(); + + size_t rows = chunk.getNumRows(); for (size_t i = 0; i < rows; ++i) { if (row_count) - writeString(settings.template_settings.row_between_delimiter, ostr); + writeString(settings.template_settings.row_between_delimiter, out); - writeRow(block, i); + writeRow(chunk, i); ++row_count; } } -void 
TemplateBlockOutputStream::writePrefix() +void TemplateBlockOutputFormat::doWritePrefix() { - writeString(format.delimiters.front(), ostr); + if (need_write_prefix) + { + writeString(format.delimiters.front(), out); + need_write_prefix = false; + } } -void TemplateBlockOutputStream::writeSuffix() +void TemplateBlockOutputFormat::finalize() { + if (finalized) + return; + + doWritePrefix(); size_t parts = format.format_idx_to_column_idx.size(); @@ -318,22 +328,23 @@ void TemplateBlockOutputStream::writeSuffix() default: break; } - writeString(format.delimiters[j + 1], ostr); + writeString(format.delimiters[j + 1], out); } + finalized = true; } -void registerOutputFormatTemplate(FormatFactory & factory) +void registerOutputFormatProcessorTemplate(FormatFactory & factory) { - factory.registerOutputFormat("Template", []( + factory.registerOutputFormatProcessor("Template", []( WriteBuffer & buf, const Block & sample, const Context &, FormatFactory::WriteCallback, const FormatSettings & settings) { - return std::make_shared(buf, sample, settings); + return std::make_shared(buf, sample, settings); }); } } diff --git a/dbms/src/Formats/TemplateBlockOutputStream.h b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h similarity index 71% rename from dbms/src/Formats/TemplateBlockOutputStream.h rename to dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h index 971e4c98c9a..1e03ef3cc29 100644 --- a/dbms/src/Formats/TemplateBlockOutputStream.h +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h @@ -1,10 +1,9 @@ #pragma once +#include #include #include -#include -#include -#include +#include namespace DB @@ -35,25 +34,25 @@ struct ParsedTemplateFormat size_t columnsCount() const; }; -class TemplateBlockOutputStream : public IBlockOutputStream +class TemplateBlockOutputFormat : public IOutputFormat { using ColumnFormat = ParsedTemplateFormat::ColumnFormat; public: - TemplateBlockOutputStream(WriteBuffer & ostr_, const Block & sample, const 
FormatSettings & settings_); - Block getHeader() const override { return header; } + TemplateBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & settings_); - void write(const Block & block) override; - void writePrefix() override; - void writeSuffix() override; + String getName() const override { return "TemplateBlockOutputFormat"; } - void flush() override; + void doWritePrefix() override; void setRowsBeforeLimit(size_t rows_before_limit_) override { rows_before_limit = rows_before_limit_; rows_before_limit_set = true; } - void setTotals(const Block & totals_) override { totals = totals_; } - void setExtremes(const Block & extremes_) override { extremes = extremes_; } void onProgress(const Progress & progress_) override { progress.incrementPiecewiseAtomically(progress_); } -private: +protected: + void consume(Chunk chunk) override; + void consumeTotals(Chunk chunk) override { totals = std::move(chunk); } + void consumeExtremes(Chunk chunk) override { extremes = std::move(chunk); } + void finalize() override; + enum class OutputPart : size_t { Data, @@ -68,26 +67,26 @@ private: }; OutputPart stringToOutputPart(const String & part); - void writeRow(const Block & block, size_t row_num); + void writeRow(const Chunk & chunk, size_t row_num); void serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat format); template void writeValue(U value, ColumnFormat col_format); -private: - WriteBuffer & ostr; - Block header; +protected: const FormatSettings settings; + DataTypes types; ParsedTemplateFormat format; ParsedTemplateFormat row_format; size_t rows_before_limit = 0; bool rows_before_limit_set = false; - Block totals; - Block extremes; + Chunk totals; + Chunk extremes; Progress progress; Stopwatch watch; size_t row_count = 0; + bool need_write_prefix = true; }; } diff --git a/dbms/src/Formats/TemplateRowInputStream.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp similarity index 82% 
rename from dbms/src/Formats/TemplateRowInputStream.cpp rename to dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 771e6238c66..f79f4290529 100644 --- a/dbms/src/Formats/TemplateRowInputStream.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -1,6 +1,5 @@ -#include +#include #include -#include #include #include @@ -15,9 +14,9 @@ extern const int CANNOT_READ_ALL_DATA; } -TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & settings_, - bool ignore_spaces_) - : RowInputStreamWithDiagnosticInfo(buf, header_), buf(istr_), data_types(header.getDataTypes()), +TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & header_, const Params & params_, + const FormatSettings & settings_, bool ignore_spaces_) + : RowInputFormatWithDiagnosticInfo(header_, in_, params_), buf(in_), data_types(header_.getDataTypes()), settings(settings_), ignore_spaces(ignore_spaces_) { static const String default_format("${data}"); @@ -35,15 +34,15 @@ TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & row_format = ParsedTemplateFormat(settings.template_settings.row_format, [&](const String & colName) { - return header.getPositionByName(colName); + return header_.getPositionByName(colName); }); - std::vector column_in_format(header.columns(), false); + std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { size_t col_idx = row_format.format_idx_to_column_idx[i]; if (column_in_format[col_idx]) - throw Exception("invalid template format: duplicate column " + header.getColumnsWithTypeAndName()[col_idx].name, + throw Exception("invalid template format: duplicate column " + header_.getColumnsWithTypeAndName()[col_idx].name, ErrorCodes::INVALID_TEMPLATE_FORMAT); column_in_format[col_idx] = true; @@ -52,13 +51,13 @@ TemplateRowInputStream::TemplateRowInputStream(ReadBuffer & istr_, const Block & } 
} -void TemplateRowInputStream::readPrefix() +void TemplateRowInputFormat::readPrefix() { skipSpaces(); assertString(format.delimiters.front(), buf); } -bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & extra) +bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra) { skipSpaces(); @@ -87,12 +86,12 @@ bool TemplateRowInputStream::read(MutableColumns & columns, RowReadExtension & e for (size_t i = 0; i < columns.size(); ++i) if (!extra.read_columns[i]) - header.getByPosition(i).type->insertDefaultInto(*columns[i]); + data_types[row_format.format_idx_to_column_idx[i]]->insertDefaultInto(*columns[i]); return true; } -void TemplateRowInputStream::deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format) +void TemplateRowInputFormat::deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format) { try { @@ -125,7 +124,7 @@ void TemplateRowInputStream::deserializeField(const IDataType & type, IColumn & /// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignnore_spaces == true) before EOF. 
/// Otherwise returns false -bool TemplateRowInputStream::checkForSuffix() +bool TemplateRowInputFormat::checkForSuffix() { if (unlikely(synced_after_error_at_last_row)) return true; @@ -148,7 +147,7 @@ bool TemplateRowInputStream::checkForSuffix() /// Returns true if buffer contains only suffix and maybe some spaces after it /// If there are not enough data in buffer, compares available data and removes it from reference to suffix -bool TemplateRowInputStream::compareSuffixPart(StringRef & suffix, BufferBase::Position pos, size_t available) +bool TemplateRowInputFormat::compareSuffixPart(StringRef & suffix, BufferBase::Position pos, size_t available) { if (suffix.size < available) { @@ -173,7 +172,7 @@ bool TemplateRowInputStream::compareSuffixPart(StringRef & suffix, BufferBase::P return true; } -bool TemplateRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) +bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { try { @@ -200,6 +199,7 @@ bool TemplateRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & col } skipSpaces(); + auto & header = getPort().getHeader(); size_t col_idx = row_format.format_idx_to_column_idx[i]; if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], out, i)) { @@ -223,7 +223,7 @@ bool TemplateRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & col return true; } -void TemplateRowInputStream::writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim) +void TemplateRowInputFormat::writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim) { out << "ERROR: There is no " << description << ": expected "; verbosePrintString(delim.data(), delim.data() + delim.size(), out); @@ -235,7 +235,7 @@ void TemplateRowInputStream::writeErrorStringForWrongDelimiter(WriteBuffer & out out << 
'\n'; } -void TemplateRowInputStream::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, +void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) { prev_pos = buf.position(); @@ -243,18 +243,18 @@ void TemplateRowInputStream::tryDeserializeFiled(const DataTypePtr & type, IColu curr_pos = buf.position(); } -bool TemplateRowInputStream::isGarbageAfterField(size_t, ReadBuffer::Position) +bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position) { /// Garbage will be considered as wrong delimiter return false; } -bool TemplateRowInputStream::allowSyncAfterError() const +bool TemplateRowInputFormat::allowSyncAfterError() const { return !row_format.delimiters.back().empty() || !settings.template_settings.row_between_delimiter.empty(); } -void TemplateRowInputStream::syncAfterError() +void TemplateRowInputFormat::syncAfterError() { skipToNextDelimiterOrEof(row_format.delimiters.back()); if (buf.eof()) @@ -274,7 +274,7 @@ void TemplateRowInputStream::syncAfterError() } /// Searches for delimiter in input stream and sets buffer position to the beginning of delimiter (if found) or EOF (if not) -void TemplateRowInputStream::skipToNextDelimiterOrEof(const String & delimiter) +void TemplateRowInputFormat::skipToNextDelimiterOrEof(const String & delimiter) { StringRef delim(delimiter); if (!delim.size) return; @@ -301,7 +301,7 @@ void TemplateRowInputStream::skipToNextDelimiterOrEof(const String & delimiter) } } -void TemplateRowInputStream::throwUnexpectedEof() +void TemplateRowInputFormat::throwUnexpectedEof() { throw Exception("Unexpected EOF while parsing row " + std::to_string(row_num) + ". 
" "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", @@ -309,22 +309,18 @@ void TemplateRowInputStream::throwUnexpectedEof() } -void registerInputFormatTemplate(FormatFactory & factory) +void registerInputFormatProcessorTemplate(FormatFactory & factory) { for (bool ignore_spaces : {false, true}) { - factory.registerInputFormat(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [=]( + factory.registerInputFormatProcessor(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [=]( ReadBuffer & buf, const Block & sample, const Context &, - UInt64 max_block_size, - UInt64 rows_portion_size, - FormatFactory::ReadCallback callback, + IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared( - std::make_shared(buf, sample, settings, ignore_spaces), - sample, max_block_size, rows_portion_size, callback, settings); + return std::make_shared(buf, sample, std::move(params), settings, ignore_spaces); }); } } diff --git a/dbms/src/Formats/TemplateRowInputStream.h b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h similarity index 71% rename from dbms/src/Formats/TemplateRowInputStream.h rename to dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h index c13251a52f5..8381da58206 100644 --- a/dbms/src/Formats/TemplateRowInputStream.h +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -1,9 +1,9 @@ #pragma once #include -#include +#include #include -#include +#include #include #include @@ -11,13 +11,16 @@ namespace DB { -class TemplateRowInputStream : public RowInputStreamWithDiagnosticInfo +class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo { using ColumnFormat = ParsedTemplateFormat::ColumnFormat; public: - TemplateRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & settings_, bool ignore_spaces_); + TemplateRowInputFormat(ReadBuffer & in_, const Block & header_, const Params & params_, + const FormatSettings & settings_, bool 
ignore_spaces_); - bool read(MutableColumns & columns, RowReadExtension & extra) override; + String getName() const override { return "TemplateRowInputFormat"; } + + bool readRow(MutableColumns & columns, RowReadExtension & extra) override; void readPrefix() override; @@ -38,7 +41,7 @@ private: bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override; void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim); - void skipToNextDelimiterOrEof(const String& delimeter); + void skipToNextDelimiterOrEof(const String & delimiter); private: PeekableReadBuffer buf; diff --git a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp similarity index 79% rename from dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp rename to dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp index 45bce165019..4458f7f52e0 100644 --- a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.cpp +++ b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -12,33 +12,34 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -DB::RowInputStreamWithDiagnosticInfo::RowInputStreamWithDiagnosticInfo(ReadBuffer & istr_, const Block & header_) - : istr(istr_), header(header_) +DB::RowInputFormatWithDiagnosticInfo::RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_) + : IRowInputFormat(header_, in_, params_) { } -void DB::RowInputStreamWithDiagnosticInfo::updateDiagnosticInfo() +void DB::RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo() { ++row_num; bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row; - bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset(); + bytes_read_at_start_of_buffer_on_current_row = in.count() - in.offset(); offset_of_prev_row = 
offset_of_current_row; - offset_of_current_row = istr.offset(); + offset_of_current_row = in.offset(); } -String DB::RowInputStreamWithDiagnosticInfo::getDiagnosticInfo() +String DB::RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() { - if (istr.eof()) /// Buffer has gone, cannot extract information about what has been parsed. + if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed. return {}; WriteBufferFromOwnString out; + auto & header = getPort().getHeader(); MutableColumns columns = header.cloneEmptyColumns(); /// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer. - size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset(); + size_t bytes_read_at_start_of_buffer = in.count() - in.offset(); if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) { out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; @@ -57,9 +58,9 @@ String DB::RowInputStreamWithDiagnosticInfo::getDiagnosticInfo() /// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information. 
- if (offset_of_prev_row <= istr.buffer().size()) + if (offset_of_prev_row <= in.buffer().size()) { - istr.position() = istr.buffer().begin() + offset_of_prev_row; + in.position() = in.buffer().begin() + offset_of_prev_row; out << "\nRow " << (row_num - 1) << ":\n"; if (!parseRowAndPrintDiagnosticInfo(columns, out)) @@ -67,13 +68,13 @@ String DB::RowInputStreamWithDiagnosticInfo::getDiagnosticInfo() } else { - if (istr.buffer().size() < offset_of_current_row) + if (in.buffer().size() < offset_of_current_row) { out << "Could not print diagnostic info because parsing of data hasn't started.\n"; return out.str(); } - istr.position() = istr.buffer().begin() + offset_of_current_row; + in.position() = in.buffer().begin() + offset_of_current_row; } out << "\nRow " << row_num << ":\n"; @@ -83,7 +84,7 @@ String DB::RowInputStreamWithDiagnosticInfo::getDiagnosticInfo() return out.str(); } -bool RowInputStreamWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, +bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, IColumn & column, WriteBuffer & out, size_t input_position) @@ -92,8 +93,8 @@ bool RowInputStreamWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co << "name: " << alignedName(col_name, max_length_of_column_name) << "type: " << alignedName(type->getName(), max_length_of_data_type_name); - auto prev_position = istr.position(); - auto curr_position = istr.position(); + auto prev_position = in.position(); + auto curr_position = in.position(); std::exception_ptr exception; try @@ -114,7 +115,7 @@ bool RowInputStreamWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co if (curr_position == prev_position) { out << "ERROR: text "; - verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out); + verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out); out << " is 
not like " << type->getName() << "\n"; return false; } @@ -141,7 +142,7 @@ bool RowInputStreamWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co if (isGarbageAfterField(input_position, curr_position)) { out << "ERROR: garbage after " << type->getName() << ": "; - verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out); + verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out); out << "\n"; if (type->getName() == "DateTime") @@ -156,7 +157,7 @@ bool RowInputStreamWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co return true; } -String RowInputStreamWithDiagnosticInfo::alignedName(const String & name, size_t max_length) const +String RowInputFormatWithDiagnosticInfo::alignedName(const String & name, size_t max_length) const { size_t spaces_count = max_length >= name.size() ? max_length - name.size() : 0; return name + ", " + std::string(spaces_count, ' '); diff --git a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h similarity index 84% rename from dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h rename to dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h index e0fad00a9a6..f335908cecf 100644 --- a/dbms/src/Formats/RowInputStreamWithDiagnosticInfo.h +++ b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include @@ -9,10 +9,10 @@ namespace DB { -class RowInputStreamWithDiagnosticInfo : public IRowInputStream +class RowInputFormatWithDiagnosticInfo : public IRowInputFormat { public: - RowInputStreamWithDiagnosticInfo(ReadBuffer & istr_, const Block & header_); + RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_); String getDiagnosticInfo() override; @@ -27,8 +27,8 @@ protected: ReadBuffer::Position & curr_pos) = 0; virtual bool isGarbageAfterField(size_t 
after_input_pos_idx, ReadBuffer::Position pos) = 0; - ReadBuffer & istr; - Block header; + //ReadBuffer & istr; + //Block header; /// For convenient diagnostics in case of an error. size_t row_num = 0; From f9445626d115ad4c8111fce82b410ae6f65119ee Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 26 Aug 2019 16:02:15 +0300 Subject: [PATCH 22/43] better parsing of format string --- .../Impl/TemplateBlockOutputFormat.cpp | 85 +++++++++++++------ .../Formats/Impl/TemplateBlockOutputFormat.h | 17 ++-- .../Formats/Impl/TemplateRowInputFormat.cpp | 6 +- .../Formats/Impl/TemplateRowInputFormat.h | 6 +- .../00937_template_output_format.sql | 4 +- 5 files changed, 77 insertions(+), 41 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 1e1854d7663..7ecada13d7d 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -1,8 +1,10 @@ #include #include #include +#include #include #include +#include namespace DB @@ -13,7 +15,7 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT; } -ParsedTemplateFormat::ParsedTemplateFormat(const String & format_string, const ColumnIdxGetter & idxByName) +ParsedTemplateFormatString::ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idxByName) { enum ParserState { @@ -21,8 +23,11 @@ ParsedTemplateFormat::ParsedTemplateFormat(const String & format_string, const C Column, Format }; + const char * pos = format_string.c_str(); + const char * end = format_string.c_str() + format_string.size(); const char * token_begin = pos; + String column_name; ParserState state = Delimiter; delimiters.emplace_back(); for (; *pos; ++pos) @@ -45,29 +50,29 @@ ParsedTemplateFormat::ParsedTemplateFormat(const String & format_string, const C } else { - throw Exception("invalid template: pos " + std::to_string(pos - 
format_string.c_str()) + + throw Exception("Invalid template format string: pos " + std::to_string(pos - format_string.c_str()) + ": expected '{' or '$' after '$'", ErrorCodes::INVALID_TEMPLATE_FORMAT); } } break; case Column: + pos = readMayBeQuotedColumnNameInto(pos, end - pos, column_name); + if (*pos == ':') - { - size_t column_idx = idxByName(String(token_begin, pos - token_begin)); - format_idx_to_column_idx.push_back(column_idx); - token_begin = pos + 1; state = Format; - } else if (*pos == '}') { - size_t column_idx = idxByName(String(token_begin, pos - token_begin)); - format_idx_to_column_idx.push_back(column_idx); formats.push_back(ColumnFormat::Default); delimiters.emplace_back(); - token_begin = pos + 1; state = Delimiter; } + else + throw Exception("Invalid template format string: Expected ':' or '}' after column name: \"" + column_name + "\"", + ErrorCodes::INVALID_TEMPLATE_FORMAT); + + token_begin = pos + 1; + format_idx_to_column_idx.emplace_back(idxByName(column_name)); break; case Format: @@ -81,12 +86,12 @@ ParsedTemplateFormat::ParsedTemplateFormat(const String & format_string, const C } } if (state != Delimiter) - throw Exception("invalid template: check parentheses balance", ErrorCodes::INVALID_TEMPLATE_FORMAT); + throw Exception("Invalid template format string: check parentheses balance", ErrorCodes::INVALID_TEMPLATE_FORMAT); delimiters.back().append(token_begin, pos - token_begin); } -ParsedTemplateFormat::ColumnFormat ParsedTemplateFormat::stringToFormat(const String & col_format) +ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) { if (col_format.empty()) return ColumnFormat::Default; @@ -103,15 +108,16 @@ ParsedTemplateFormat::ColumnFormat ParsedTemplateFormat::stringToFormat(const St else if (col_format == "Raw") return ColumnFormat::Raw; else - throw Exception("invalid template: unknown field format " + col_format, ErrorCodes::INVALID_TEMPLATE_FORMAT); + throw 
Exception("Invalid template format string: unknown field format " + col_format, + ErrorCodes::INVALID_TEMPLATE_FORMAT); } -size_t ParsedTemplateFormat::columnsCount() const +size_t ParsedTemplateFormatString::columnsCount() const { return format_idx_to_column_idx.size(); } -String ParsedTemplateFormat::formatToString(ParsedTemplateFormat::ColumnFormat format) +String ParsedTemplateFormatString::formatToString(ParsedTemplateFormatString::ColumnFormat format) { switch (format) { @@ -133,6 +139,27 @@ String ParsedTemplateFormat::formatToString(ParsedTemplateFormat::ColumnFormat f __builtin_unreachable(); } +const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s) +{ + s.clear(); + if (!size) + return pos; + ReadBufferFromMemory buf{pos, size}; + if (*pos == '"') + readDoubleQuotedStringWithSQLStyle(s, buf); + else if (*pos == '`') + readBackQuotedStringWithSQLStyle(s, buf); + else if (isWordCharASCII(*pos)) + { + size_t name_size = 1; + while (name_size < size && isWordCharASCII(*(pos + name_size))) + ++name_size; + s = String{pos, name_size}; + return pos + name_size; + } + return pos + buf.count(); +} + TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & settings_) : IOutputFormat(header_, out_), settings(settings_) @@ -143,21 +170,25 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const B for (size_t i = 0; i < columns; ++i) types[i] = sample.safeGetByPosition(i).type; + /// Parse format string for whole output static const String default_format("${data}"); const String & format_str = settings.template_settings.format.empty() ? 
default_format : settings.template_settings.format; - format = ParsedTemplateFormat(format_str, [&](const String & partName) + format = ParsedTemplateFormatString(format_str, [&](const String & partName) { return static_cast(stringToOutputPart(partName)); }); - size_t dataIdx = format.format_idx_to_column_idx.size() + 1; + /// Validate format string for whole output + size_t data_idx = format.format_idx_to_column_idx.size() + 1; for (size_t i = 0; i < format.format_idx_to_column_idx.size(); ++i) { - switch (static_cast(format.format_idx_to_column_idx[i])) + if (!format.format_idx_to_column_idx[i]) + throw Exception("Output part name cannot be empty, it's a bug.", ErrorCodes::LOGICAL_ERROR); + switch (static_cast(*format.format_idx_to_column_idx[i])) { case OutputPart::Data: - dataIdx = i; - BOOST_FALLTHROUGH; + data_idx = i; + [[fallthrough]]; case OutputPart::Totals: case OutputPart::ExtremesMin: case OutputPart::ExtremesMax: @@ -169,17 +200,21 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const B break; } } - - if (dataIdx != 0) + if (data_idx != 0) throw Exception("invalid template: ${data} must be the first output part", ErrorCodes::INVALID_TEMPLATE_FORMAT); - row_format = ParsedTemplateFormat(settings.template_settings.row_format, [&](const String & colName) + /// Parse format string for rows + row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) { return sample.getPositionByName(colName); }); + /// Validate format string for rows if (row_format.delimiters.size() == 1) throw Exception("invalid template: no columns specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); + for (const auto & idx_mapping : row_format.format_idx_to_column_idx) + if (!idx_mapping) + throw Exception("Cannot skip format field for output, it's a bug.", ErrorCodes::LOGICAL_ERROR); } TemplateBlockOutputFormat::OutputPart TemplateBlockOutputFormat::stringToOutputPart(const String & part) @@ -213,7 +248,7 @@ void 
TemplateBlockOutputFormat::writeRow(const Chunk & chunk, size_t row_num) { writeString(row_format.delimiters[j], out); - size_t col_idx = row_format.format_idx_to_column_idx[j]; + size_t col_idx = *row_format.format_idx_to_column_idx[j]; serializeField(*chunk.getColumns()[col_idx], *types[col_idx], row_num, row_format.formats[j]); } writeString(row_format.delimiters[columns], out); @@ -291,7 +326,7 @@ void TemplateBlockOutputFormat::finalize() { auto type = std::make_shared(); ColumnWithTypeAndName col(type->createColumnConst(1, row_count), type, String("tmp")); - switch (static_cast(format.format_idx_to_column_idx[j])) + switch (static_cast(*format.format_idx_to_column_idx[j])) { case OutputPart::Totals: if (!totals) diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h index 1e03ef3cc29..292e527ae16 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h @@ -9,7 +9,7 @@ namespace DB { -struct ParsedTemplateFormat +struct ParsedTemplateFormatString { enum class ColumnFormat { @@ -23,20 +23,21 @@ struct ParsedTemplateFormat }; std::vector delimiters; std::vector formats; - std::vector format_idx_to_column_idx; + std::vector> format_idx_to_column_idx; - typedef std::function ColumnIdxGetter; + typedef std::function(const String &)> ColumnIdxGetter; - ParsedTemplateFormat() = default; - ParsedTemplateFormat(const String & format_string, const ColumnIdxGetter & idxByName); + ParsedTemplateFormatString() = default; + ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idxByName); static ColumnFormat stringToFormat(const String & format); static String formatToString(ColumnFormat format); + static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s); size_t columnsCount() const; }; class TemplateBlockOutputFormat : public IOutputFormat { - 
using ColumnFormat = ParsedTemplateFormat::ColumnFormat; + using ColumnFormat = ParsedTemplateFormatString::ColumnFormat; public: TemplateBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & settings_); @@ -75,8 +76,8 @@ protected: const FormatSettings settings; DataTypes types; - ParsedTemplateFormat format; - ParsedTemplateFormat row_format; + ParsedTemplateFormatString format; + ParsedTemplateFormatString row_format; size_t rows_before_limit = 0; bool rows_before_limit_set = false; diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index f79f4290529..8cef30b4ae1 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -21,7 +21,7 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h { static const String default_format("${data}"); const String & format_str = settings.template_settings.format.empty() ? 
default_format : settings.template_settings.format; - format = ParsedTemplateFormat(format_str, [&](const String & partName) + format = ParsedTemplateFormatString(format_str, [&](const String & partName) { if (partName == "data") return 0; @@ -32,7 +32,7 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h throw Exception("invalid template format: format_schema must be \"prefix ${data} suffix\"", ErrorCodes::INVALID_TEMPLATE_FORMAT); - row_format = ParsedTemplateFormat(settings.template_settings.row_format, [&](const String & colName) + row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) { return header_.getPositionByName(colName); }); @@ -204,7 +204,7 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], out, i)) { out << "Maybe it's not possible to deserialize field " + std::to_string(i) + - " as " + ParsedTemplateFormat::formatToString(row_format.formats[i]); + " as " + ParsedTemplateFormatString::formatToString(row_format.formats[i]); return false; } } diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 8381da58206..a33ea8bd3ed 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -13,7 +13,7 @@ namespace DB class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo { - using ColumnFormat = ParsedTemplateFormat::ColumnFormat; + using ColumnFormat = ParsedTemplateFormatString::ColumnFormat; public: TemplateRowInputFormat(ReadBuffer & in_, const Block & header_, const Params & params_, const FormatSettings & settings_, bool ignore_spaces_); @@ -48,8 +48,8 @@ private: DataTypes data_types; FormatSettings settings; - ParsedTemplateFormat format; - 
ParsedTemplateFormat row_format; + ParsedTemplateFormatString format; + ParsedTemplateFormatString row_format; const bool ignore_spaces; bool synced_after_error_at_last_row = false; }; diff --git a/dbms/tests/queries/0_stateless/00937_template_output_format.sql b/dbms/tests/queries/0_stateless/00937_template_output_format.sql index 9aed990149a..119b6f61717 100644 --- a/dbms/tests/queries/0_stateless/00937_template_output_format.sql +++ b/dbms/tests/queries/0_stateless/00937_template_output_format.sql @@ -1,12 +1,12 @@ DROP TABLE IF EXISTS test.template; -CREATE TABLE test.template (s1 String, s2 String, s3 String, s4 String, n UInt64, d Date) ENGINE = Memory; +CREATE TABLE test.template (s1 String, s2 String, `s 3` String, "s 4" String, n UInt64, d Date) ENGINE = Memory; INSERT INTO test.template VALUES ('qwe,rty', 'as"df''gh', '', 'zx\ncv\tbn m', 123, '2016-01-01'),('as"df''gh', '', 'zx\ncv\tbn m', 'qwe,rty', 456, '2016-01-02'),('', 'zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', 9876543210, '2016-01-03'),('zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', '', 789, '2016-01-04'); SELECT * FROM test.template WITH TOTALS LIMIT 4 FORMAT Template SETTINGS extremes = 1, format_schema = '{prefix} \n${data}\n------\n${totals:}\n------\n${min}\n------\n${max}\n${rows:} rows\nbefore limit ${rows_before_limit:XML}\nread ${rows_read} $$ suffix $$', -format_schema_rows = 'n:\t${n:JSON}, s1:\t${s1:Escaped}, s2:\t${s2:Quoted}, s3:\t${s3:JSON}, s4:\t${s4:CSV}, d:\t${d}, n:\t${n:Raw}\t', +format_schema_rows = 'n:\t${n:JSON}, s1:\t${s1:Escaped}, s2:\t${s2:Quoted}, s3:\t${`s 3`:JSON}, s4:\t${"s 4":CSV}, d:\t${d}, n:\t${n:Raw}\t', format_schema_rows_between_delimiter = ';\n'; DROP TABLE test.template; \ No newline at end of file From cf3a8b993b1895109938675f9e9030211b86167e Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 27 Aug 2019 19:53:26 +0300 Subject: [PATCH 23/43] allow skipping fields in TemplateRowInputFormat --- .../Impl/TemplateBlockOutputFormat.cpp | 25 +- 
.../Formats/Impl/TemplateBlockOutputFormat.h | 7 +- .../Formats/Impl/TemplateRowInputFormat.cpp | 315 +++++++++++++----- .../Formats/Impl/TemplateRowInputFormat.h | 8 +- .../00937_template_output_format.sql | 14 +- .../00938_template_input_format.sh | 10 +- 6 files changed, 266 insertions(+), 113 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 7ecada13d7d..fb2b96a561e 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -63,7 +63,7 @@ ParsedTemplateFormatString::ParsedTemplateFormatString(const String & format_str state = Format; else if (*pos == '}') { - formats.push_back(ColumnFormat::Default); + formats.push_back(ColumnFormat::None); delimiters.emplace_back(); state = Delimiter; } @@ -94,7 +94,9 @@ ParsedTemplateFormatString::ParsedTemplateFormatString(const String & format_str ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) { if (col_format.empty()) - return ColumnFormat::Default; + return ColumnFormat::None; + else if (col_format == "None") + return ColumnFormat::None; else if (col_format == "Escaped") return ColumnFormat::Escaped; else if (col_format == "Quoted") @@ -121,8 +123,8 @@ String ParsedTemplateFormatString::formatToString(ParsedTemplateFormatString::Co { switch (format) { - case ColumnFormat::Default: - return "Escaped (Default)"; + case ColumnFormat::None: + return "None"; case ColumnFormat::Escaped: return "Escaped"; case ColumnFormat::Quoted: @@ -192,11 +194,13 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const B case OutputPart::Totals: case OutputPart::ExtremesMin: case OutputPart::ExtremesMax: - if (format.formats[i] != ColumnFormat::Default) + if (format.formats[i] != ColumnFormat::None) throw Exception("invalid template: wrong serialization type 
for data, totals, min or max", ErrorCodes::INVALID_TEMPLATE_FORMAT); break; default: + if (format.formats[i] == ColumnFormat::None) + throw Exception("Serialization type for output part rows, rows_before_limit, time, rows_read or bytes_read not specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); break; } } @@ -212,9 +216,13 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const B /// Validate format string for rows if (row_format.delimiters.size() == 1) throw Exception("invalid template: no columns specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); - for (const auto & idx_mapping : row_format.format_idx_to_column_idx) - if (!idx_mapping) + for (size_t i = 0; i < row_format.columnsCount(); ++i) + { + if (!row_format.format_idx_to_column_idx[i]) throw Exception("Cannot skip format field for output, it's a bug.", ErrorCodes::LOGICAL_ERROR); + if (row_format.formats[i] == ColumnFormat::None) + throw Exception("Serialization type for file column " + std::to_string(i) + " not specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); + } } TemplateBlockOutputFormat::OutputPart TemplateBlockOutputFormat::stringToOutputPart(const String & part) @@ -258,7 +266,6 @@ void TemplateBlockOutputFormat::serializeField(const IColumn & column, const IDa { switch (col_format) { - case ColumnFormat::Default: case ColumnFormat::Escaped: type.serializeAsTextEscaped(column, row_num, out, settings); break; @@ -277,6 +284,8 @@ void TemplateBlockOutputFormat::serializeField(const IColumn & column, const IDa case ColumnFormat::Raw: type.serializeAsText(column, row_num, out, settings); break; + default: + __builtin_unreachable(); } } diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h index 292e527ae16..1f0e2b1cf58 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h @@ -13,7 +13,7 @@ struct 
ParsedTemplateFormatString { enum class ColumnFormat { - Default, + None, Escaped, Quoted, Csv, @@ -21,6 +21,11 @@ struct ParsedTemplateFormatString Xml, Raw }; + + /// Format string has syntax: "Delimiter0 ${ColumnName0:Format0} Delimiter1 ${ColumnName1:Format1} Delimiter2" + /// The following vectors is filled with corresponding values, delimiters.size() - 1 = formats.size() = format_idx_to_column_idx.size() + /// If format_idx_to_column_idx[i] has no value, then TemplateRowInputStream will skip i-th column. + std::vector delimiters; std::vector formats; std::vector> format_idx_to_column_idx; diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 8cef30b4ae1..3ceffe65288 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB { @@ -11,6 +12,8 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT; extern const int ATTEMPT_TO_READ_AFTER_EOF; extern const int CANNOT_READ_ALL_DATA; +extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; +extern const int CANNOT_PARSE_QUOTED_STRING; } @@ -19,50 +22,136 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h : RowInputFormatWithDiagnosticInfo(header_, in_, params_), buf(in_), data_types(header_.getDataTypes()), settings(settings_), ignore_spaces(ignore_spaces_) { + /// Parse format string for whole input static const String default_format("${data}"); const String & format_str = settings.template_settings.format.empty() ? 
default_format : settings.template_settings.format; - format = ParsedTemplateFormatString(format_str, [&](const String & partName) + format = ParsedTemplateFormatString(format_str, [&](const String & partName) -> std::optional { if (partName == "data") return 0; + else if (partName.empty()) /// For skipping some values in prefix and suffix + return {}; throw Exception("invalid template format: unknown input part " + partName, ErrorCodes::INVALID_TEMPLATE_FORMAT); }); - if (format.formats.size() != 1 || format.formats[0] != ColumnFormat::Default) - throw Exception("invalid template format: format_schema must be \"prefix ${data} suffix\"", ErrorCodes::INVALID_TEMPLATE_FORMAT); - - - row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) + /// Validate format string for whole input + bool has_data = false; + for (size_t i = 0; i < format.columnsCount(); ++i) { + if (format.format_idx_to_column_idx[i]) + { + if (has_data) + throw Exception("${data} can occur only once", ErrorCodes::INVALID_TEMPLATE_FORMAT); + if (format.formats[i] != ColumnFormat::None) + throw Exception("invalid template format: ${data} must have empty or None serialization type", ErrorCodes::INVALID_TEMPLATE_FORMAT); + has_data = true; + format_data_idx = i; + } + else + { + if (format.formats[i] == ColumnFormat::None || format.formats[i] == ColumnFormat::Xml || format.formats[i] == ColumnFormat::Raw) + throw Exception("None, XML and Raw deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); + } + } + + /// Parse format string for rows + row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) -> std::optional + { + if (colName.empty()) + return {}; return header_.getPositionByName(colName); }); + /// Validate format string for rows std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - size_t col_idx = 
row_format.format_idx_to_column_idx[i]; - if (column_in_format[col_idx]) - throw Exception("invalid template format: duplicate column " + header_.getColumnsWithTypeAndName()[col_idx].name, - ErrorCodes::INVALID_TEMPLATE_FORMAT); - column_in_format[col_idx] = true; + if (row_format.formats[i] == ColumnFormat::None || row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw) + throw Exception("invalid template format: None, XML and Raw deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); - if (row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw) - throw Exception("invalid template format: XML and Raw deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); + if (format.format_idx_to_column_idx[i]) + { + size_t col_idx = *row_format.format_idx_to_column_idx[i]; + if (column_in_format[col_idx]) + throw Exception("invalid template format: duplicate column " + header_.getColumnsWithTypeAndName()[col_idx].name, + ErrorCodes::INVALID_TEMPLATE_FORMAT); + column_in_format[col_idx] = true; + } } } void TemplateRowInputFormat::readPrefix() { + tryReadPrefixOrSuffix(0, format_data_idx); +} + +/// Asserts delimiters and skips fields in prefix or suffix. +/// tryReadPrefixOrSuffix(...) 
is used in checkForSuffix() to avoid throwing an exception after read of each row +/// (most likely false will be returned on first call of checkString(...)) +template +ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t input_part_beg, size_t input_part_end) +{ + static constexpr bool throw_exception = std::is_same_v; + skipSpaces(); - assertString(format.delimiters.front(), buf); + if constexpr (throw_exception) + assertString(format.delimiters[input_part_end], buf); + else + { + if (likely(!checkString(format.delimiters[input_part_end], buf))) + return ReturnType(false); + } + + while (input_part_beg < input_part_end) + { + skipSpaces(); + if constexpr (throw_exception) + skipField(format.formats[input_part_beg]); + else + { + try + { + skipField(format.formats[input_part_beg]); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && + e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && + e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) + throw; + /// If it's parsing error, then suffix is not found + return ReturnType(false); + } + } + ++input_part_beg; + + skipSpaces(); + if constexpr (throw_exception) + assertString(format.delimiters[input_part_beg], buf); + else + { + if (likely(!checkString(format.delimiters[input_part_beg], buf))) + return ReturnType(false); + } + } + + if constexpr (!throw_exception) + return ReturnType(true); } bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra) { + /// This function can be called again after it returned false + if (unlikely(end_of_stream)) + return false; + skipSpaces(); - if (checkForSuffix()) + if (unlikely(checkForSuffix())) + { + end_of_stream = true; return false; + } updateDiagnosticInfo(); @@ -75,10 +164,16 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension { skipSpaces(); assertString(row_format.delimiters[i], buf); - size_t col_idx = row_format.format_idx_to_column_idx[i]; 
skipSpaces(); - deserializeField(*data_types[col_idx], *columns[col_idx], row_format.formats[i]); - extra.read_columns[col_idx] = true; + if (row_format.format_idx_to_column_idx[i]) + { + size_t col_idx = *row_format.format_idx_to_column_idx[i]; + deserializeField(*data_types[col_idx], *columns[col_idx], row_format.formats[i]); + extra.read_columns[col_idx] = true; + } + else + skipField(row_format.formats[i]); + } skipSpaces(); @@ -86,7 +181,7 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension for (size_t i = 0; i < columns.size(); ++i) if (!extra.read_columns[i]) - data_types[row_format.format_idx_to_column_idx[i]]->insertDefaultInto(*columns[i]); + data_types[i]->insertDefaultInto(*columns[i]); return true; } @@ -97,7 +192,6 @@ void TemplateRowInputFormat::deserializeField(const IDataType & type, IColumn & { switch (col_format) { - case ColumnFormat::Default: case ColumnFormat::Escaped: type.deserializeAsTextEscaped(column, buf, settings); break; @@ -111,7 +205,7 @@ void TemplateRowInputFormat::deserializeField(const IDataType & type, IColumn & type.deserializeAsTextJSON(column, buf, settings); break; default: - break; + __builtin_unreachable(); } } catch (Exception & e) @@ -122,54 +216,69 @@ void TemplateRowInputFormat::deserializeField(const IDataType & type, IColumn & } } -/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignnore_spaces == true) before EOF. 
+void TemplateRowInputFormat::skipField(TemplateRowInputFormat::ColumnFormat col_format) +{ + String tmp; + try + { + switch (col_format) + { + case ColumnFormat::None: + /// Empty field, just skip spaces + break; + case ColumnFormat::Escaped: + readEscapedString(tmp, buf); + break; + case ColumnFormat::Quoted: + readQuotedString(tmp, buf); + break; + case ColumnFormat::Csv: + readCSVString(tmp, buf, settings.csv); + break; + case ColumnFormat::Json: + readJSONString(tmp, buf); + break; + default: + __builtin_unreachable(); + } + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throwUnexpectedEof(); + throw; + } +} + +/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF. /// Otherwise returns false bool TemplateRowInputFormat::checkForSuffix() { - if (unlikely(synced_after_error_at_last_row)) - return true; - - StringRef suffix(format.delimiters.back()); - if (likely(!compareSuffixPart(suffix, buf.position(), buf.available()))) - return false; - - while (buf.peekNext()) + PeekableReadBufferCheckpoint checkpoint{buf}; + bool suffix_found = false; + try { - BufferBase::Buffer peeked = buf.lastPeeked(); - if (likely(!compareSuffixPart(suffix, peeked.begin(), peeked.size()))) - return false; + suffix_found = tryReadPrefixOrSuffix(format_data_idx + 1, format.columnsCount()); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && + e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && + e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) + throw; } - if (suffix.size) - throwUnexpectedEof(); - return true; -} + /// TODO better diagnostic in case of invalid suffix -/// Returns true if buffer contains only suffix and maybe some spaces after it -/// If there are not enough data in buffer, compares available data and removes it from reference to suffix -bool TemplateRowInputFormat::compareSuffixPart(StringRef & suffix, 
BufferBase::Position pos, size_t available) -{ - if (suffix.size < available) + if (unlikely(suffix_found)) { - if (!ignore_spaces) - return false; - if (likely(suffix != StringRef(pos, suffix.size))) - return false; - - BufferBase::Position end = pos + available; - pos += suffix.size; - suffix.size = 0; - while (pos != end) - if (!isWhitespaceASCII(*pos++)) - return false; - return true; + skipSpaces(); + if (buf.eof()) + return true; } - if (likely(StringRef(suffix.data, available) != StringRef(pos, available))) - return false; - suffix.data += available; - suffix.size -= available; - return true; + buf.rollbackToCheckpoint(); + return false; } bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) @@ -199,13 +308,25 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col } skipSpaces(); - auto & header = getPort().getHeader(); - size_t col_idx = row_format.format_idx_to_column_idx[i]; - if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], out, i)) + if (row_format.format_idx_to_column_idx[i]) { - out << "Maybe it's not possible to deserialize field " + std::to_string(i) + - " as " + ParsedTemplateFormatString::formatToString(row_format.formats[i]); - return false; + auto & header = getPort().getHeader(); + size_t col_idx = *row_format.format_idx_to_column_idx[i]; + if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], + *columns[col_idx], out, i)) + { + out << "Maybe it's not possible to deserialize field " + std::to_string(i) + + " as " + ParsedTemplateFormatString::formatToString(row_format.formats[i]); + return false; + } + } + else + { + static const String skipped_column_str = ""; + static const DataTypePtr skipped_column_type = std::make_shared(); + static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); + if 
(!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, i)) + return false; } } @@ -239,7 +360,10 @@ void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColu ReadBuffer::Position & curr_pos) { prev_pos = buf.position(); - deserializeField(*type, column, row_format.formats[input_position]); + if (row_format.format_idx_to_column_idx[input_position]) + deserializeField(*type, column, row_format.formats[input_position]); + else + skipField(row_format.formats[input_position]); curr_pos = buf.position(); } @@ -256,31 +380,45 @@ bool TemplateRowInputFormat::allowSyncAfterError() const void TemplateRowInputFormat::syncAfterError() { - skipToNextDelimiterOrEof(row_format.delimiters.back()); - if (buf.eof()) + bool at_beginning_of_row_or_eof = false; + while (!at_beginning_of_row_or_eof) { - synced_after_error_at_last_row = true; - return; + skipToNextDelimiterOrEof(row_format.delimiters.back()); + if (buf.eof()) + { + end_of_stream = true; + return; + } + buf.ignore(row_format.delimiters.back().size()); + + skipSpaces(); + if (checkForSuffix()) + return; + + bool last_delimiter_in_row_found = !row_format.delimiters.back().empty(); + + if (last_delimiter_in_row_found && checkString(settings.template_settings.row_between_delimiter, buf)) + at_beginning_of_row_or_eof = true; + else + skipToNextDelimiterOrEof(settings.template_settings.row_between_delimiter); + + if (buf.eof()) + at_beginning_of_row_or_eof = end_of_stream = true; } - buf.ignore(row_format.delimiters.back().size()); - - skipSpaces(); - if (checkForSuffix()) - return; - - skipToNextDelimiterOrEof(settings.template_settings.row_between_delimiter); - if (buf.eof()) - synced_after_error_at_last_row = true; + /// It can happen that buf.position() is not at the beginning of row + /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. + /// It will cause another parsing error. 
} /// Searches for delimiter in input stream and sets buffer position to the beginning of delimiter (if found) or EOF (if not) void TemplateRowInputFormat::skipToNextDelimiterOrEof(const String & delimiter) { - StringRef delim(delimiter); - if (!delim.size) return; + if (delimiter.empty()) + return; + while (!buf.eof()) { - void* pos = memchr(buf.position(), *delim.data, buf.available()); + void * pos = memchr(buf.position(), delimiter[0], buf.available()); if (!pos) { buf.position() += buf.available(); @@ -289,15 +427,12 @@ void TemplateRowInputFormat::skipToNextDelimiterOrEof(const String & delimiter) buf.position() = static_cast(pos); - /// Peek data until we can compare it with whole delim - while (buf.available() < delim.size && buf.peekNext()); - - if (buf.available() < delim.size) - buf.position() += buf.available(); /// EOF, there is no delim - else if (delim != StringRef(buf.position(), delim.size)) - ++buf.position(); - else + PeekableReadBufferCheckpoint checkpoint{buf}; + if (checkString(delimiter, buf)) return; + + buf.rollbackToCheckpoint(); + ++buf.position(); } } diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h index a33ea8bd3ed..408fcf26203 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -29,11 +29,13 @@ public: private: void deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format); + void skipField(ColumnFormat col_format); inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); } + template + ReturnType tryReadPrefixOrSuffix(size_t input_part_beg, size_t input_part_end); bool checkForSuffix(); [[noreturn]] void throwUnexpectedEof(); - bool compareSuffixPart(StringRef & suffix, BufferBase::Position pos, size_t available); bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; void 
tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, @@ -51,7 +53,9 @@ private: ParsedTemplateFormatString format; ParsedTemplateFormatString row_format; const bool ignore_spaces; - bool synced_after_error_at_last_row = false; + + size_t format_data_idx; + bool end_of_stream = false; }; } diff --git a/dbms/tests/queries/0_stateless/00937_template_output_format.sql b/dbms/tests/queries/0_stateless/00937_template_output_format.sql index 119b6f61717..7a981c641da 100644 --- a/dbms/tests/queries/0_stateless/00937_template_output_format.sql +++ b/dbms/tests/queries/0_stateless/00937_template_output_format.sql @@ -1,12 +1,12 @@ -DROP TABLE IF EXISTS test.template; -CREATE TABLE test.template (s1 String, s2 String, `s 3` String, "s 4" String, n UInt64, d Date) ENGINE = Memory; -INSERT INTO test.template VALUES +DROP TABLE IF EXISTS template; +CREATE TABLE template (s1 String, s2 String, `s 3` String, "s 4" String, n UInt64, d Date) ENGINE = Memory; +INSERT INTO template VALUES ('qwe,rty', 'as"df''gh', '', 'zx\ncv\tbn m', 123, '2016-01-01'),('as"df''gh', '', 'zx\ncv\tbn m', 'qwe,rty', 456, '2016-01-02'),('', 'zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', 9876543210, '2016-01-03'),('zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', '', 789, '2016-01-04'); -SELECT * FROM test.template WITH TOTALS LIMIT 4 FORMAT Template SETTINGS +SELECT * FROM template WITH TOTALS LIMIT 4 FORMAT Template SETTINGS extremes = 1, -format_schema = '{prefix} \n${data}\n------\n${totals:}\n------\n${min}\n------\n${max}\n${rows:} rows\nbefore limit ${rows_before_limit:XML}\nread ${rows_read} $$ suffix $$', -format_schema_rows = 'n:\t${n:JSON}, s1:\t${s1:Escaped}, s2:\t${s2:Quoted}, s3:\t${`s 3`:JSON}, s4:\t${"s 4":CSV}, d:\t${d}, n:\t${n:Raw}\t', +format_schema = '{prefix} \n${data:None}\n------\n${totals:}\n------\n${min}\n------\n${max}\n${rows:Escaped} rows\nbefore limit ${rows_before_limit:XML}\nread ${rows_read:Escaped} $$ suffix $$', 
+format_schema_rows = 'n:\t${n:JSON}, s1:\t${s1:Escaped}, s2:\t${s2:Quoted}, s3:\t${`s 3`:JSON}, s4:\t${"s 4":CSV}, d:\t${d:Escaped}, n:\t${n:Raw}\t', format_schema_rows_between_delimiter = ';\n'; -DROP TABLE test.template; \ No newline at end of file +DROP TABLE template; diff --git a/dbms/tests/queries/0_stateless/00938_template_input_format.sh b/dbms/tests/queries/0_stateless/00938_template_input_format.sh index c397c901f22..9529f829b05 100755 --- a/dbms/tests/queries/0_stateless/00938_template_input_format.sh +++ b/dbms/tests/queries/0_stateless/00938_template_input_format.sh @@ -3,8 +3,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . $CURDIR/../shell_config.sh -$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.template"; -$CLICKHOUSE_CLIENT --query="CREATE TABLE test.template (s1 String, s2 String, s3 String, s4 String, n UInt64, d Date) ENGINE = Memory"; +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS template"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE template (s1 String, s2 String, s3 String, s4 String, n UInt64, d Date) ENGINE = Memory"; echo "{prefix} n: 123, s1: qwe,rty , s2: 'as\"df\\'gh', s3: \"\", s4: \"zx @@ -12,7 +12,7 @@ cv bn m\", d: 2016-01-01 ; n: 456, s1: as\"df\\'gh , s2: '', s3: \"zx\\ncv\\tbn m\", s4: \"qwe,rty\", d: 2016-01-02 ; n: 9876543210, s1: , s2: 'zx\\ncv\\tbn m', s3: \"qwe,rty\", s4: \"as\"\"df'gh\", d: 2016-01-03 ; n: 789, s1: zx\\ncv\\tbn m , s2: 'qwe,rty', s3: \"as\\\"df'gh\", s4: \"\", d: 2016-01-04 - $ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO test.template FORMAT Template SETTINGS format_schema = '{prefix} \n\${data}\n \$\$ suffix \$\$\n', format_schema_rows = 'n:\t\${n}, s1:\t\${s1:Escaped}\t, s2:\t\${s2:Quoted}, s3:\t\${s3:JSON}, s4:\t\${s4:CSV}, d:\t\${d}\t', format_schema_rows_between_delimiter = ';\n'"; + $ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO template FORMAT Template SETTINGS format_schema = '{prefix} \n\${data}\n \$\$ suffix \$\$\n', format_schema_rows = 'n:\t\${n:Escaped}, 
s1:\t\${s1:Escaped}\t, s2:\t\${s2:Quoted}, s3:\t\${s3:JSON}, s4:\t\${s4:CSV}, d:\t\${d:Escaped}\t', format_schema_rows_between_delimiter = ';\n'"; -$CLICKHOUSE_CLIENT --query="SELECT * FROM test.template ORDER BY n FORMAT CSV"; -$CLICKHOUSE_CLIENT --query="DROP TABLE test.template"; +$CLICKHOUSE_CLIENT --query="SELECT * FROM template ORDER BY n FORMAT CSV"; +$CLICKHOUSE_CLIENT --query="DROP TABLE template"; From 93c672aa0be35f8ba5aad1c5ca9912001c09c1ca Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 27 Aug 2019 21:29:56 +0300 Subject: [PATCH 24/43] delete BlockInputStreamFromRowInputStream --- .gitignore | 1 - .../BlockInputStreamFromRowInputStream.cpp | 178 ------------------ .../BlockInputStreamFromRowInputStream.h | 62 ------ dbms/src/Formats/tests/CMakeLists.txt | 3 - .../Formats/tests/block_row_transforms.cpp | 57 ------ .../Formats/tests/tab_separated_streams.cpp | 12 +- .../Processors/Formats/IRowInputFormat.cpp | 2 +- 7 files changed, 8 insertions(+), 307 deletions(-) delete mode 100644 dbms/src/Formats/BlockInputStreamFromRowInputStream.cpp delete mode 100644 dbms/src/Formats/BlockInputStreamFromRowInputStream.h delete mode 100644 dbms/src/Formats/tests/block_row_transforms.cpp diff --git a/.gitignore b/.gitignore index 5dc058c97c0..817e333d833 100644 --- a/.gitignore +++ b/.gitignore @@ -90,7 +90,6 @@ dbms/src/Core/tests/field dbms/src/Core/tests/rvo_test dbms/src/Core/tests/string_pool dbms/src/DataStreams/tests/aggregating_stream -dbms/src/DataStreams/tests/block_row_transforms dbms/src/DataStreams/tests/block_tab_separated_streams dbms/src/DataStreams/tests/collapsing_sorted_stream dbms/src/DataStreams/tests/expression_stream diff --git a/dbms/src/Formats/BlockInputStreamFromRowInputStream.cpp b/dbms/src/Formats/BlockInputStreamFromRowInputStream.cpp deleted file mode 100644 index fc38b476e0b..00000000000 --- a/dbms/src/Formats/BlockInputStreamFromRowInputStream.cpp +++ /dev/null @@ -1,178 +0,0 @@ -#include -#include -#include -#include - 
- -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED; - extern const int CANNOT_PARSE_QUOTED_STRING; - extern const int CANNOT_PARSE_DATE; - extern const int CANNOT_PARSE_DATETIME; - extern const int CANNOT_READ_ARRAY_FROM_TEXT; - extern const int CANNOT_PARSE_NUMBER; - extern const int CANNOT_PARSE_UUID; - extern const int TOO_LARGE_STRING_SIZE; - extern const int CANNOT_READ_ALL_DATA; - extern const int INCORRECT_DATA; - extern const int INCORRECT_NUMBER_OF_COLUMNS; -} - - -BlockInputStreamFromRowInputStream::BlockInputStreamFromRowInputStream( - const RowInputStreamPtr & row_input_, - const Block & sample_, - UInt64 max_block_size_, - UInt64 rows_portion_size_, - FormatFactory::ReadCallback callback, - const FormatSettings & settings) - : row_input(row_input_) - , sample(sample_) - , max_block_size(max_block_size_) - , rows_portion_size(rows_portion_size_) - , read_virtual_columns_callback(callback) - , allow_errors_num(settings.input_allow_errors_num) - , allow_errors_ratio(settings.input_allow_errors_ratio) -{ -} - - -static bool isParseError(int code) -{ - return code == ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED - || code == ErrorCodes::CANNOT_PARSE_QUOTED_STRING - || code == ErrorCodes::CANNOT_PARSE_DATE - || code == ErrorCodes::CANNOT_PARSE_DATETIME - || code == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT - || code == ErrorCodes::CANNOT_PARSE_NUMBER - || code == ErrorCodes::CANNOT_PARSE_UUID - || code == ErrorCodes::TOO_LARGE_STRING_SIZE - || code == ErrorCodes::CANNOT_READ_ALL_DATA - || code == ErrorCodes::INCORRECT_DATA; -} - - -Block BlockInputStreamFromRowInputStream::readImpl() -{ - size_t num_columns = sample.columns(); - MutableColumns columns = sample.cloneEmptyColumns(); - block_missing_values.clear(); - - try - { - for (size_t rows = 0, batch = 0; rows < max_block_size; ++rows, ++batch) - { - if (rows_portion_size && batch == rows_portion_size) - { - batch = 0; - if (!checkTimeLimit() || 
isCancelled()) - break; - } - - try - { - ++total_rows; - RowReadExtension info_; - if (!row_input->read(columns, info_)) - break; - if (read_virtual_columns_callback) - read_virtual_columns_callback(); - - for (size_t column_idx = 0; column_idx < info_.read_columns.size(); ++column_idx) - { - if (!info_.read_columns[column_idx]) - { - size_t column_size = columns[column_idx]->size(); - if (column_size == 0) - throw Exception("Unexpected empty column", ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS); - block_missing_values.setBit(column_idx, column_size - 1); - } - } - } - catch (Exception & e) - { - /// Logic for possible skipping of errors. - - if (!isParseError(e.code())) - throw; - - if (allow_errors_num == 0 && allow_errors_ratio == 0) - throw; - - ++num_errors; - Float32 current_error_ratio = static_cast(num_errors) / total_rows; - - if (num_errors > allow_errors_num - && current_error_ratio > allow_errors_ratio) - { - e.addMessage("(Already have " + toString(num_errors) + " errors" - " out of " + toString(total_rows) + " rows" - ", which is " + toString(current_error_ratio) + " of all rows)"); - throw; - } - - if (!row_input->allowSyncAfterError()) - { - e.addMessage("(Input format doesn't allow to skip errors)"); - throw; - } - - row_input->syncAfterError(); - - /// Truncate all columns in block to minimal size (remove values, that was appended to only part of columns). - - size_t min_size = std::numeric_limits::max(); - for (size_t column_idx = 0; column_idx < num_columns; ++column_idx) - min_size = std::min(min_size, columns[column_idx]->size()); - - for (size_t column_idx = 0; column_idx < num_columns; ++column_idx) - { - auto & column = columns[column_idx]; - if (column->size() > min_size) - column->popBack(column->size() - min_size); - } - } - } - } - catch (Exception & e) - { - if (!isParseError(e.code())) - throw; - - String verbose_diagnostic; - try - { - verbose_diagnostic = row_input->getDiagnosticInfo(); - } - catch (...) 
- { - /// Error while trying to obtain verbose diagnostic. Ok to ignore. - } - - e.addMessage("(at row " + toString(total_rows) + ")\n" + verbose_diagnostic); - throw; - } - - if (columns.empty() || columns[0]->empty()) - return {}; - - return sample.cloneWithColumns(std::move(columns)); -} - - -void BlockInputStreamFromRowInputStream::readSuffix() -{ - if (allow_errors_num > 0 || allow_errors_ratio > 0) - { - Logger * log = &Logger::get("BlockInputStreamFromRowInputStream"); - LOG_TRACE(log, "Skipped " << num_errors << " rows with errors while reading the input stream"); - } - - row_input->readSuffix(); -} - -} diff --git a/dbms/src/Formats/BlockInputStreamFromRowInputStream.h b/dbms/src/Formats/BlockInputStreamFromRowInputStream.h deleted file mode 100644 index 2338af3bf38..00000000000 --- a/dbms/src/Formats/BlockInputStreamFromRowInputStream.h +++ /dev/null @@ -1,62 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - - -namespace DB -{ - -/** Makes block-oriented stream on top of row-oriented stream. - * It is used to read data from text formats. - * - * Also controls over parsing errors and prints diagnostic information about them. 
- */ -class BlockInputStreamFromRowInputStream : public IBlockInputStream -{ -public: - /// |sample| is a block with zero rows, that structure describes how to interpret values - /// |rows_portion_size| is a number of rows to read before break and check limits - BlockInputStreamFromRowInputStream( - const RowInputStreamPtr & row_input_, - const Block & sample_, - UInt64 max_block_size_, - UInt64 rows_portion_size_, - FormatFactory::ReadCallback callback, - const FormatSettings & settings); - - void readPrefix() override { row_input->readPrefix(); } - void readSuffix() override; - - String getName() const override { return "BlockInputStreamFromRowInputStream"; } - - RowInputStreamPtr & getRowInput() { return row_input; } - - Block getHeader() const override { return sample; } - - const BlockMissingValues & getMissingValues() const override { return block_missing_values; } - -protected: - Block readImpl() override; - -private: - RowInputStreamPtr row_input; - Block sample; - UInt64 max_block_size; - UInt64 rows_portion_size; - - /// Callback used to setup virtual columns after reading each row. 
- FormatFactory::ReadCallback read_virtual_columns_callback; - - BlockMissingValues block_missing_values; - - UInt64 allow_errors_num; - Float32 allow_errors_ratio; - - size_t total_rows = 0; - size_t num_errors = 0; -}; -} diff --git a/dbms/src/Formats/tests/CMakeLists.txt b/dbms/src/Formats/tests/CMakeLists.txt index e12fa0f02fb..187700dff72 100644 --- a/dbms/src/Formats/tests/CMakeLists.txt +++ b/dbms/src/Formats/tests/CMakeLists.txt @@ -2,6 +2,3 @@ set(SRCS ) add_executable (tab_separated_streams tab_separated_streams.cpp ${SRCS}) target_link_libraries (tab_separated_streams PRIVATE dbms) - -add_executable (block_row_transforms block_row_transforms.cpp ${SRCS}) -target_link_libraries (block_row_transforms PRIVATE dbms) diff --git a/dbms/src/Formats/tests/block_row_transforms.cpp b/dbms/src/Formats/tests/block_row_transforms.cpp deleted file mode 100644 index 9edc520d85f..00000000000 --- a/dbms/src/Formats/tests/block_row_transforms.cpp +++ /dev/null @@ -1,57 +0,0 @@ -#include - -#include -#include - -#include -#include - -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include - - -int main(int, char **) -try -{ - using namespace DB; - - Block sample; - - ColumnWithTypeAndName col1; - col1.name = "col1"; - col1.type = std::make_shared(); - col1.column = col1.type->createColumn(); - sample.insert(col1); - - ColumnWithTypeAndName col2; - col2.name = "col2"; - col2.type = std::make_shared(); - col2.column = col2.type->createColumn(); - sample.insert(col2); - - ReadBufferFromFile in_buf("test_in"); - WriteBufferFromFile out_buf("test_out"); - - FormatSettings format_settings; - - RowInputStreamPtr row_input = std::make_shared(in_buf, sample, false, false, format_settings); - BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings); - BlockOutputStreamPtr block_output = std::make_shared(std::make_shared(out_buf, sample, false, false, []{}, format_settings)); - - 
copyData(block_input, *block_output); -} -catch (const DB::Exception & e) -{ - std::cerr << e.what() << ", " << e.displayText() << std::endl; - return 1; -} diff --git a/dbms/src/Formats/tests/tab_separated_streams.cpp b/dbms/src/Formats/tests/tab_separated_streams.cpp index 2c44ccf2b43..46999f6e594 100644 --- a/dbms/src/Formats/tests/tab_separated_streams.cpp +++ b/dbms/src/Formats/tests/tab_separated_streams.cpp @@ -9,12 +9,12 @@ #include #include -#include -#include +#include #include #include #include +#include using namespace DB; @@ -39,13 +39,15 @@ try FormatSettings format_settings; - RowInputStreamPtr row_input = std::make_shared(in_buf, sample, false, false, format_settings); - BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings); + RowInputFormatParams params{DEFAULT_INSERT_BLOCK_SIZE, 0, 0, 0, []{}}; + + InputFormatPtr input_format = std::make_shared(in_buf, sample, false, false, params, format_settings); + BlockInputStreamPtr block_input = std::make_shared(std::move(input_format)); BlockOutputStreamPtr block_output = std::make_shared( std::make_shared(out_buf, sample, false, false, [] {}, format_settings)); - copyData(block_input, *block_output); + copyData(*block_input, *block_output); return 0; } catch (...) 
diff --git a/dbms/src/Processors/Formats/IRowInputFormat.cpp b/dbms/src/Processors/Formats/IRowInputFormat.cpp index 2860587cbf2..b45c714ea07 100644 --- a/dbms/src/Processors/Formats/IRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/IRowInputFormat.cpp @@ -174,7 +174,7 @@ Chunk IRowInputFormat::generate() { if (params.allow_errors_num > 0 || params.allow_errors_ratio > 0) { - Logger * log = &Logger::get("BlockInputStreamFromRowInputStream"); + Logger * log = &Logger::get("IRowInputFormat"); LOG_TRACE(log, "Skipped " << num_errors << " rows with errors while reading the input stream"); } From c8096542bff7bba24ae6e6d8a3725dd10e7e7c88 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 28 Aug 2019 19:46:54 +0300 Subject: [PATCH 25/43] more tests and fixes --- .../Formats/Impl/TemplateRowInputFormat.cpp | 17 ++++++--- .../00938_template_input_format.reference | 17 +++++++++ .../00938_template_input_format.sh | 38 ++++++++++++++++--- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 3ceffe65288..a0e28e886a1 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -49,7 +49,7 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h } else { - if (format.formats[i] == ColumnFormat::None || format.formats[i] == ColumnFormat::Xml || format.formats[i] == ColumnFormat::Raw) + if (format.formats[i] == ColumnFormat::Xml || format.formats[i] == ColumnFormat::Raw) throw Exception("None, XML and Raw deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); } } @@ -66,11 +66,14 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - if (row_format.formats[i] == 
ColumnFormat::None || row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw) + if (row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw) throw Exception("invalid template format: None, XML and Raw deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); - if (format.format_idx_to_column_idx[i]) + if (row_format.format_idx_to_column_idx[i]) { + if (row_format.formats[i] == ColumnFormat::None) + throw Exception("invalid template format: None, XML and Raw deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); + size_t col_idx = *row_format.format_idx_to_column_idx[i]; if (column_in_format[col_idx]) throw Exception("invalid template format: duplicate column " + header_.getColumnsWithTypeAndName()[col_idx].name, @@ -95,10 +98,10 @@ ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t input_part_beg, skipSpaces(); if constexpr (throw_exception) - assertString(format.delimiters[input_part_end], buf); + assertString(format.delimiters[input_part_beg], buf); else { - if (likely(!checkString(format.delimiters[input_part_end], buf))) + if (likely(!checkString(format.delimiters[input_part_beg], buf))) return ReturnType(false); } @@ -219,6 +222,8 @@ void TemplateRowInputFormat::deserializeField(const IDataType & type, IColumn & void TemplateRowInputFormat::skipField(TemplateRowInputFormat::ColumnFormat col_format) { String tmp; + constexpr const char * field_name = ""; + constexpr size_t field_name_len = 16; try { switch (col_format) @@ -236,7 +241,7 @@ void TemplateRowInputFormat::skipField(TemplateRowInputFormat::ColumnFormat col_ readCSVString(tmp, buf, settings.csv); break; case ColumnFormat::Json: - readJSONString(tmp, buf); + skipJSONField(buf, StringRef(field_name, field_name_len)); break; default: __builtin_unreachable(); diff --git a/dbms/tests/queries/0_stateless/00938_template_input_format.reference 
b/dbms/tests/queries/0_stateless/00938_template_input_format.reference index 3947822de3f..ce89532886d 100644 --- a/dbms/tests/queries/0_stateless/00938_template_input_format.reference +++ b/dbms/tests/queries/0_stateless/00938_template_input_format.reference @@ -1,3 +1,4 @@ +==== check escaping ==== "qwe,rty","as""df'gh","","zx cv bn m",123,"2016-01-01" "as""df'gh","","zx @@ -6,3 +7,19 @@ cv bn m","qwe,rty",456,"2016-01-02" cv bn m","qwe,rty","as""df'gh","",789,"2016-01-04" "","zx cv bn m","qwe,rty","as""df'gh",9876543210,"2016-01-03" +==== parse json (sophisticated template) ==== +"qwe,rty","as""df'gh","","zx +cv bn m",123,"2016-01-01" +"as""df'gh","","zx +cv bn m","qwe,rty",456,"2016-01-02" +"zx +cv bn m","qwe,rty","as""df'gh","",789,"2016-01-04" +"","zx +cv bn m","qwe,rty","as""df'gh",9876543210,"2016-01-03" +==== parse json ==== +"","","qwe,rty","",123,"2016-01-01" +"zx +cv bn m","","as""df'gh","",456,"2016-01-02" +"as""df'gh","","zx +cv bn m","",789,"2016-01-04" +"qwe,rty","","","",9876543210,"2016-01-03" diff --git a/dbms/tests/queries/0_stateless/00938_template_input_format.sh b/dbms/tests/queries/0_stateless/00938_template_input_format.sh index 9529f829b05..c33741543e9 100755 --- a/dbms/tests/queries/0_stateless/00938_template_input_format.sh +++ b/dbms/tests/queries/0_stateless/00938_template_input_format.sh @@ -3,8 +3,12 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . 
$CURDIR/../shell_config.sh -$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS template"; -$CLICKHOUSE_CLIENT --query="CREATE TABLE template (s1 String, s2 String, s3 String, s4 String, n UInt64, d Date) ENGINE = Memory"; +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS template1"; +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS template2"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE template1 (s1 String, s2 String, s3 String, s4 String, n UInt64, d Date) ENGINE = Memory"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE template2 (s1 String, s2 String, s3 String, s4 String, n UInt64, d Date) ENGINE = Memory"; + +echo "==== check escaping ====" echo "{prefix} n: 123, s1: qwe,rty , s2: 'as\"df\\'gh', s3: \"\", s4: \"zx @@ -12,7 +16,31 @@ cv bn m\", d: 2016-01-01 ; n: 456, s1: as\"df\\'gh , s2: '', s3: \"zx\\ncv\\tbn m\", s4: \"qwe,rty\", d: 2016-01-02 ; n: 9876543210, s1: , s2: 'zx\\ncv\\tbn m', s3: \"qwe,rty\", s4: \"as\"\"df'gh\", d: 2016-01-03 ; n: 789, s1: zx\\ncv\\tbn m , s2: 'qwe,rty', s3: \"as\\\"df'gh\", s4: \"\", d: 2016-01-04 - $ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO template FORMAT Template SETTINGS format_schema = '{prefix} \n\${data}\n \$\$ suffix \$\$\n', format_schema_rows = 'n:\t\${n:Escaped}, s1:\t\${s1:Escaped}\t, s2:\t\${s2:Quoted}, s3:\t\${s3:JSON}, s4:\t\${s4:CSV}, d:\t\${d:Escaped}\t', format_schema_rows_between_delimiter = ';\n'"; + $ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO template1 FORMAT Template SETTINGS \ +format_schema = '{prefix} \n\${data}\n \$\$ suffix \$\$\n', \ +format_schema_rows = 'n:\t\${n:Escaped}, s1:\t\${s1:Escaped}\t, s2:\t\${s2:Quoted}, s3:\t\${s3:JSON}, s4:\t\${s4:CSV}, d:\t\${d:Escaped}\t', \ +format_schema_rows_between_delimiter = ';\n'"; -$CLICKHOUSE_CLIENT --query="SELECT * FROM template ORDER BY n FORMAT CSV"; -$CLICKHOUSE_CLIENT --query="DROP TABLE template"; +$CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT CSV"; + +echo "==== parse json (sophisticated template) ====" + 
+$CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT JSON" | $CLICKHOUSE_CLIENT --query="INSERT INTO template2 FORMAT TemplateIgnoreSpaces SETTINGS \ +format_schema = '{\${:}\"meta\"\${:}:\${:}[\${:}{\${:}\"name\"\${:}:\${:}\"s1\"\${:},\${:}\"type\"\${:}:\${:}\"String\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"s2\"\${:},\${:}\"type\"\${:}:\${:}\"String\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"s3\"\${:},\${:}\"type\"\${:}:\${:}\"String\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"s4\"\${:},\${:}\"type\"\${:}:\${:}\"String\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"n\"\${:},\${:}\"type\"\${:}:\${:}\"UInt64\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"d\"\${:},\${:}\"type\"\${:}:\${:}\"Date\"\${:}}\${:}]\${:},\${:}\"data\"\${:}:\${:}[\${data}]\${:},\${:}\"rows\"\${:}:\${:}\${:CSV}\${:},\${:}\"statistics\"\${:}:\${:}{\${:}\"elapsed\"\${:}:\${:}\${:CSV}\${:},\${:}\"rows_read\"\${:}:\${:}\${:CSV}\${:},\${:}\"bytes_read\"\${:}:\${:}\${:CSV}\${:}}\${:}}', \ +format_schema_rows = '{\${:}\"s1\"\${:}:\${:}\${s1:JSON}\${:},\${:}\"s2\"\${:}:\${:}\${s2:JSON}\${:},\${:}\"s3\"\${:}:\${:}\${s3:JSON}\${:},\${:}\"s4\"\${:}:\${:}\${s4:JSON}\${:},\${:}\"n\"\${:}:\${:}\${n:JSON}\${:},\${:}\"d\"\${:}:\${:}\${d:JSON}\${:}\${:}}', \ +format_schema_rows_between_delimiter = ','"; + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template2 ORDER BY n FORMAT CSV"; +$CLICKHOUSE_CLIENT --query="TRUNCATE TABLE template2"; + +echo "==== parse json ====" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT JSON" | $CLICKHOUSE_CLIENT --query="INSERT INTO template2 FORMAT TemplateIgnoreSpaces SETTINGS \ +format_schema = '{\${:}\"meta\"\${:}:\${:JSON},\${:}\"data\"\${:}:\${:}[\${data}]\${:},\${:}\"rows\"\${:}:\${:JSON},\${:}\"statistics\"\${:}:\${:JSON}\${:}}', \ +format_schema_rows = 
'{\${:}\"s1\"\${:}:\${:}\${s3:JSON}\${:},\${:}\"s2\"\${:}:\${:}\${:JSON}\${:},\${:}\"s3\"\${:}:\${:}\${s1:JSON}\${:},\${:}\"s4\"\${:}:\${:}\${:JSON}\${:},\${:}\"n\"\${:}:\${:}\${n:JSON}\${:},\${:}\"d\"\${:}:\${:}\${d:JSON}\${:}\${:}}', \ +format_schema_rows_between_delimiter = ','"; + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template2 ORDER BY n FORMAT CSV"; + +$CLICKHOUSE_CLIENT --query="DROP TABLE template1"; +$CLICKHOUSE_CLIENT --query="DROP TABLE template2"; From d6ca9fee63e0ab1594d0c7f7a70c1c5c50a5de83 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 29 Aug 2019 16:30:43 +0300 Subject: [PATCH 26/43] unit test for PeekableReadBuffer --- dbms/src/IO/PeekableReadBuffer.cpp | 66 ++++++++-- dbms/src/IO/PeekableReadBuffer.h | 20 +++- dbms/src/IO/tests/CMakeLists.txt | 3 + dbms/src/IO/tests/peekable_read_buffer.cpp | 133 +++++++++++++++++++++ 4 files changed, 203 insertions(+), 19 deletions(-) create mode 100644 dbms/src/IO/tests/peekable_read_buffer.cpp diff --git a/dbms/src/IO/PeekableReadBuffer.cpp b/dbms/src/IO/PeekableReadBuffer.cpp index f624d8b8104..0b6b1f5805c 100644 --- a/dbms/src/IO/PeekableReadBuffer.cpp +++ b/dbms/src/IO/PeekableReadBuffer.cpp @@ -3,10 +3,11 @@ namespace DB { -PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t unread_limit_ /* = default_limit*/) - : sub_buf(sub_buf_), unread_limit(unread_limit_) +PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ /*= DBMS_DEFAULT_BUFFER_SIZE*/, + size_t unread_limit_ /* = default_limit*/) + : BufferWithOwnMemory(start_size_), sub_buf(sub_buf_), unread_limit(unread_limit_) { - padded = sub_buf.isPadded(); + padded &= sub_buf.isPadded(); /// Read from sub-buffer Buffer & sub_working = sub_buf.buffer(); BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset()); @@ -19,18 +20,21 @@ bool PeekableReadBuffer::peekNext() { checkStateCorrect(); + size_t bytes_read = 0; + Position copy_from = pos; size_t bytes_to_copy = 
sub_buf.available(); if (useSubbufferOnly()) { /// Don't have to copy all data from sub-buffer if there is no data in own memory (checkpoint and pos are in sub-buffer) - Position copy_from = pos; if (checkpoint) copy_from = checkpoint; - bytes += copy_from - sub_buf.buffer().begin(); - sub_buf.position() = copy_from; - bytes_to_copy = sub_buf.available(); + bytes_read = copy_from - sub_buf.buffer().begin(); + bytes_to_copy = sub_buf.buffer().end() - copy_from; // sub_buf.available(); if (!bytes_to_copy) { + bytes += bytes_read; + sub_buf.position() = copy_from; + /// Both checkpoint and pos are at the end of sub-buffer. Just load next part of data. bool res = sub_buf.next(); BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset()); @@ -42,8 +46,15 @@ bool PeekableReadBuffer::peekNext() } } + /// May throw an exception resizeOwnMemoryIfNecessary(bytes_to_copy); + if (useSubbufferOnly()) + { + bytes += bytes_read; + sub_buf.position() = copy_from; + } + /// Save unread data from sub-buffer to own memory memcpy(memory.data() + peeked_size, sub_buf.position(), bytes_to_copy); @@ -83,6 +94,11 @@ bool PeekableReadBuffer::peekNext() void PeekableReadBuffer::setCheckpoint() { + checkStateCorrect(); +#ifdef NDEBUG + if (!checkpoint) + throw DB::Exception("Does not support recursive checkpoints.", ErrorCodes::LOGICAL_ERROR); +#endif checkpoint_in_own_memory = currentlyReadFromOwnMemory(); if (!checkpoint_in_own_memory) { @@ -90,10 +106,16 @@ void PeekableReadBuffer::setCheckpoint() peeked_size = 0; } checkpoint = pos; + checkStateCorrect(); } void PeekableReadBuffer::dropCheckpoint() { + checkStateCorrect(); +#ifdef NDEBUG + if (!checkpoint) + throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); +#endif if (!currentlyReadFromOwnMemory()) { /// Don't need to store unread data anymore @@ -101,16 +123,19 @@ void PeekableReadBuffer::dropCheckpoint() } checkpoint = nullptr; checkpoint_in_own_memory = false; + checkStateCorrect(); } 
void PeekableReadBuffer::rollbackToCheckpoint() { + checkStateCorrect(); if (!checkpoint) throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); else if (checkpointInOwnMemory() == currentlyReadFromOwnMemory()) pos = checkpoint; else /// Checkpoint is in own memory and pos is not. Switch to reading from own memory BufferBase::set(memory.data(), peeked_size, checkpoint - memory.data()); + checkStateCorrect(); } bool PeekableReadBuffer::nextImpl() @@ -126,6 +151,7 @@ bool PeekableReadBuffer::nextImpl() { /// All copied data have been read from own memory, continue reading from sub_buf peeked_size = 0; + res = sub_buf.hasPendingData() || sub_buf.next(); } else { @@ -140,7 +166,9 @@ bool PeekableReadBuffer::nextImpl() } else { - if (!currentlyReadFromOwnMemory()) + if (currentlyReadFromOwnMemory()) + res = sub_buf.hasPendingData() || sub_buf.next(); + else res = peekNext(); Buffer & sub_working = sub_buf.buffer(); BufferBase::set(sub_working.begin(), sub_working.size(), 0); @@ -157,6 +185,7 @@ bool PeekableReadBuffer::useSubbufferOnly() const void PeekableReadBuffer::checkStateCorrect() const { +#ifdef NDEBUG if (checkpoint) { if (checkpointInOwnMemory()) @@ -184,6 +213,9 @@ void PeekableReadBuffer::checkStateCorrect() const } if (currentlyReadFromOwnMemory() && !peeked_size) throw DB::Exception("Pos in empty own buffer", ErrorCodes::LOGICAL_ERROR); + if (unread_limit < memory.size()) + throw DB::Exception("Size limit exceed", ErrorCodes::LOGICAL_ERROR); +#endif } size_t PeekableReadBuffer::resizeOwnMemoryIfNecessary(size_t bytes_to_append) @@ -218,12 +250,16 @@ size_t PeekableReadBuffer::resizeOwnMemoryIfNecessary(size_t bytes_to_append) else { if (unread_limit < new_size) - throw DB::Exception("trying to peek too much data", ErrorCodes::MEMORY_LIMIT_EXCEEDED); + throw DB::Exception("PeekableReadBuffer: Memory limit exceed", ErrorCodes::MEMORY_LIMIT_EXCEEDED); size_t pos_offset = pos - memory.data(); - // TODO amortization - memory.resize(new_size); 
+ size_t new_size_amortized = memory.size() * 2; + if (new_size_amortized < new_size) + new_size_amortized = new_size; + else if (unread_limit < new_size_amortized) + new_size_amortized = unread_limit; + memory.resize(new_size_amortized); if (needUpdateCheckpoint) checkpoint = memory.data() + offset; @@ -246,15 +282,18 @@ PeekableReadBuffer::~PeekableReadBuffer() std::shared_ptr> PeekableReadBuffer::takeUnreadData() { + checkStateCorrect(); if (!currentlyReadFromOwnMemory()) return std::make_shared>(0); size_t unread_size = memory.data() + peeked_size - pos; auto unread = std::make_shared>(unread_size); memcpy(unread->buffer().begin(), pos, unread_size); + unread->BufferBase::set(unread->buffer().begin(), unread_size, 0); peeked_size = 0; checkpoint = nullptr; checkpoint_in_own_memory = false; BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset()); + checkStateCorrect(); return unread; } @@ -271,8 +310,9 @@ bool PeekableReadBuffer::checkpointInOwnMemory() const void PeekableReadBuffer::assertCanBeDestructed() const { if (peeked_size && pos != memory.data() + peeked_size) - throw DB::Exception("There are data, which were extracted from sub-buffer, but not from peekable buffer: " - "Cannot destruct peekable buffer correctly because tha data will be lost", ErrorCodes::LOGICAL_ERROR); + throw DB::Exception("There are data, which were extracted from sub-buffer, but not from peekable buffer. " + "Cannot destruct peekable buffer correctly because tha data will be lost." + "Most likely it's a bug.", ErrorCodes::LOGICAL_ERROR); } } diff --git a/dbms/src/IO/PeekableReadBuffer.h b/dbms/src/IO/PeekableReadBuffer.h index e6079c4f33e..30a38b69e5c 100644 --- a/dbms/src/IO/PeekableReadBuffer.h +++ b/dbms/src/IO/PeekableReadBuffer.h @@ -19,10 +19,10 @@ namespace ErrorCodes /// (e.g. by istr.position() = prev_pos), behavior is undefined. 
class PeekableReadBuffer : public BufferWithOwnMemory { + friend class PeekableReadBufferCheckpoint; public: - constexpr static size_t default_limit = 32 * DBMS_DEFAULT_BUFFER_SIZE; - - explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t unread_limit_ = default_limit); + explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ = DBMS_DEFAULT_BUFFER_SIZE, + size_t unread_limit_ = 16 * DBMS_DEFAULT_BUFFER_SIZE); /// Use takeUnreadData() to extract unread data before destruct object ~PeekableReadBuffer() override; @@ -59,7 +59,6 @@ private: inline bool currentlyReadFromOwnMemory() const; inline bool checkpointInOwnMemory() const; - // TODO add unit test for PeekableReadBuffer and remove this method void checkStateCorrect() const; /// Makes possible to append `bytes_to_append` bytes to data in own memory. @@ -79,9 +78,18 @@ private: class PeekableReadBufferCheckpoint : boost::noncopyable { PeekableReadBuffer & buf; + bool auto_rollback; public: - explicit PeekableReadBufferCheckpoint(PeekableReadBuffer & buf_) : buf(buf_) { buf.setCheckpoint(); } - ~PeekableReadBufferCheckpoint() { buf.dropCheckpoint(); } + explicit PeekableReadBufferCheckpoint(PeekableReadBuffer & buf_, bool auto_rollback_ = false) + : buf(buf_), auto_rollback(auto_rollback_) { buf.setCheckpoint(); } + ~PeekableReadBufferCheckpoint() + { + if (!buf.checkpoint) + return; + if (auto_rollback) + buf.rollbackToCheckpoint(); + buf.dropCheckpoint(); + } }; diff --git a/dbms/src/IO/tests/CMakeLists.txt b/dbms/src/IO/tests/CMakeLists.txt index 2c3dc307b18..9ba5659449a 100644 --- a/dbms/src/IO/tests/CMakeLists.txt +++ b/dbms/src/IO/tests/CMakeLists.txt @@ -77,6 +77,9 @@ target_link_libraries (limit_read_buffer2 PRIVATE clickhouse_common_io) add_executable (parse_date_time_best_effort parse_date_time_best_effort.cpp) target_link_libraries (parse_date_time_best_effort PRIVATE clickhouse_common_io) +add_executable (peekable_read_buffer peekable_read_buffer.cpp) +target_link_libraries 
(peekable_read_buffer PRIVATE clickhouse_common_io) + add_executable (zlib_ng_bug zlib_ng_bug.cpp) target_link_libraries (zlib_ng_bug PRIVATE ${Poco_Foundation_LIBRARY}) if(NOT USE_INTERNAL_POCO_LIBRARY) diff --git a/dbms/src/IO/tests/peekable_read_buffer.cpp b/dbms/src/IO/tests/peekable_read_buffer.cpp new file mode 100644 index 00000000000..eca0b06fe3a --- /dev/null +++ b/dbms/src/IO/tests/peekable_read_buffer.cpp @@ -0,0 +1,133 @@ +#include +#include +#include +#include +#include + + +void readAndAssert(DB::ReadBuffer & buf, const char * str) +{ + size_t n = strlen(str); + char tmp[n]; + buf.readStrict(tmp, n); + assert(strncmp(tmp, str, n) == 0); +} + +void assertAvailable(DB::ReadBuffer & buf, const char * str) +{ + size_t n = strlen(str); + assert(buf.available() == n); + assert(strncmp(buf.position(), str, n) == 0); +} + +int main(int, char **) +{ + try + { + std::string s1 = "0123456789"; + std::string s2 = "qwertyuiop"; + std::string s3 = "asdfghjkl;"; + std::string s4 = "zxcvbnm,./"; + DB::ReadBufferFromString b1(s1); + DB::ReadBufferFromString b2(s2); + DB::ReadBufferFromString b3(s3); + DB::ReadBufferFromString b4(s4); + + DB::ConcatReadBuffer concat({&b1, &b2, &b3, &b4}); + DB::PeekableReadBuffer peekable(concat, 0, 16); + + assert(!peekable.eof()); + assertAvailable(peekable, "0123456789"); + { + DB::PeekableReadBufferCheckpoint checkpoint{peekable}; + readAndAssert(peekable, "01234"); + } + bool exception = false; + try + { + peekable.rollbackToCheckpoint(); + } + catch (DB::Exception & e) + { + if (e.code() != DB::ErrorCodes::LOGICAL_ERROR) + throw; + exception = true; + } + assert(exception); + assertAvailable(peekable, "56789"); + + readAndAssert(peekable, "56"); + + peekable.setCheckpoint(); + readAndAssert(peekable, "789qwertyu"); + peekable.rollbackToCheckpoint(); + peekable.dropCheckpoint(); + assertAvailable(peekable, "789"); + peekable.peekNext(); + assertAvailable(peekable, "789qwertyuiop"); + assert(peekable.lastPeeked().size() == 10); + 
assert(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10) == 0); + + exception = false; + try + { + DB::PeekableReadBufferCheckpoint checkpoint{peekable, true}; + peekable.ignore(30); + } + catch (DB::Exception & e) + { + if (e.code() != DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED) + throw; + exception = true; + } + assert(exception); + assertAvailable(peekable, "789qwertyuiop"); + assert(peekable.lastPeeked().size() == 10); + assert(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10) == 0); + + readAndAssert(peekable, "789qwertyu"); + peekable.setCheckpoint(); + readAndAssert(peekable, "iopasdfghj"); + assertAvailable(peekable, "kl;"); + peekable.dropCheckpoint(); + + peekable.setCheckpoint(); + readAndAssert(peekable, "kl;zxcvbnm,./"); + assert(peekable.eof()); + assert(peekable.eof()); + assert(peekable.eof()); + peekable.rollbackToCheckpoint(); + readAndAssert(peekable, "kl;zxcvbnm"); + peekable.dropCheckpoint(); + + exception = false; + try + { + peekable.assertCanBeDestructed(); + } + catch (DB::Exception & e) + { + if (e.code() != DB::ErrorCodes::LOGICAL_ERROR) + throw; + exception = true; + } + assert(exception); + + auto buf_ptr = peekable.takeUnreadData(); + assert(peekable.eof()); + assert(peekable.eof()); + assert(peekable.eof()); + + readAndAssert(*buf_ptr, ",./"); + assert(buf_ptr->eof()); + + peekable.assertCanBeDestructed(); + } + catch (const DB::Exception & e) + { + std::cerr << e.what() << ", " << e.displayText() << std::endl; + return 1; + } + + return 0; +} From d95d53b4e49754e64ccf2e107981043fe99a2762 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 29 Aug 2019 22:29:54 +0300 Subject: [PATCH 27/43] better error messages --- dbms/src/IO/PeekableReadBuffer.cpp | 8 +- dbms/src/IO/tests/peekable_read_buffer.cpp | 41 ++-- .../Parsers/ParsedTemplateFormatString.cpp | 217 ++++++++++++++++++ dbms/src/Parsers/ParsedTemplateFormatString.h | 51 ++++ .../Impl/TemplateBlockOutputFormat.cpp | 195 ++-------------- 
.../Formats/Impl/TemplateBlockOutputFormat.h | 32 +-- .../Formats/Impl/TemplateRowInputFormat.cpp | 56 +++-- .../Formats/Impl/TemplateRowInputFormat.h | 2 +- 8 files changed, 359 insertions(+), 243 deletions(-) create mode 100644 dbms/src/Parsers/ParsedTemplateFormatString.cpp create mode 100644 dbms/src/Parsers/ParsedTemplateFormatString.h diff --git a/dbms/src/IO/PeekableReadBuffer.cpp b/dbms/src/IO/PeekableReadBuffer.cpp index 0b6b1f5805c..dde9ec35aff 100644 --- a/dbms/src/IO/PeekableReadBuffer.cpp +++ b/dbms/src/IO/PeekableReadBuffer.cpp @@ -95,8 +95,8 @@ bool PeekableReadBuffer::peekNext() void PeekableReadBuffer::setCheckpoint() { checkStateCorrect(); -#ifdef NDEBUG - if (!checkpoint) +#ifndef NDEBUG + if (checkpoint) throw DB::Exception("Does not support recursive checkpoints.", ErrorCodes::LOGICAL_ERROR); #endif checkpoint_in_own_memory = currentlyReadFromOwnMemory(); @@ -112,7 +112,7 @@ void PeekableReadBuffer::setCheckpoint() void PeekableReadBuffer::dropCheckpoint() { checkStateCorrect(); -#ifdef NDEBUG +#ifndef NDEBUG if (!checkpoint) throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); #endif @@ -185,7 +185,7 @@ bool PeekableReadBuffer::useSubbufferOnly() const void PeekableReadBuffer::checkStateCorrect() const { -#ifdef NDEBUG +#ifndef NDEBUG if (checkpoint) { if (checkpointInOwnMemory()) diff --git a/dbms/src/IO/tests/peekable_read_buffer.cpp b/dbms/src/IO/tests/peekable_read_buffer.cpp index eca0b06fe3a..d9149f5377d 100644 --- a/dbms/src/IO/tests/peekable_read_buffer.cpp +++ b/dbms/src/IO/tests/peekable_read_buffer.cpp @@ -4,20 +4,25 @@ #include #include +void assertTrue(bool b) +{ + if (!b) + throw DB::Exception("assert failed", DB::ErrorCodes::LOGICAL_ERROR); +} void readAndAssert(DB::ReadBuffer & buf, const char * str) { size_t n = strlen(str); char tmp[n]; buf.readStrict(tmp, n); - assert(strncmp(tmp, str, n) == 0); + assertTrue(strncmp(tmp, str, n) == 0); } void assertAvailable(DB::ReadBuffer & buf, const char * str) { 
size_t n = strlen(str); - assert(buf.available() == n); - assert(strncmp(buf.position(), str, n) == 0); + assertTrue(buf.available() == n); + assertTrue(strncmp(buf.position(), str, n) == 0); } int main(int, char **) @@ -36,7 +41,7 @@ int main(int, char **) DB::ConcatReadBuffer concat({&b1, &b2, &b3, &b4}); DB::PeekableReadBuffer peekable(concat, 0, 16); - assert(!peekable.eof()); + assertTrue(!peekable.eof()); assertAvailable(peekable, "0123456789"); { DB::PeekableReadBufferCheckpoint checkpoint{peekable}; @@ -53,7 +58,7 @@ int main(int, char **) throw; exception = true; } - assert(exception); + assertTrue(exception); assertAvailable(peekable, "56789"); readAndAssert(peekable, "56"); @@ -65,8 +70,8 @@ int main(int, char **) assertAvailable(peekable, "789"); peekable.peekNext(); assertAvailable(peekable, "789qwertyuiop"); - assert(peekable.lastPeeked().size() == 10); - assert(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10) == 0); + assertTrue(peekable.lastPeeked().size() == 10); + assertTrue(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10) == 0); exception = false; try @@ -80,10 +85,10 @@ int main(int, char **) throw; exception = true; } - assert(exception); + assertTrue(exception); assertAvailable(peekable, "789qwertyuiop"); - assert(peekable.lastPeeked().size() == 10); - assert(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10) == 0); + assertTrue(peekable.lastPeeked().size() == 10); + assertTrue(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10) == 0); readAndAssert(peekable, "789qwertyu"); peekable.setCheckpoint(); @@ -93,9 +98,9 @@ int main(int, char **) peekable.setCheckpoint(); readAndAssert(peekable, "kl;zxcvbnm,./"); - assert(peekable.eof()); - assert(peekable.eof()); - assert(peekable.eof()); + assertTrue(peekable.eof()); + assertTrue(peekable.eof()); + assertTrue(peekable.eof()); peekable.rollbackToCheckpoint(); readAndAssert(peekable, "kl;zxcvbnm"); peekable.dropCheckpoint(); @@ -111,15 +116,15 @@ int main(int, char **) 
throw; exception = true; } - assert(exception); + assertTrue(exception); auto buf_ptr = peekable.takeUnreadData(); - assert(peekable.eof()); - assert(peekable.eof()); - assert(peekable.eof()); + assertTrue(peekable.eof()); + assertTrue(peekable.eof()); + assertTrue(peekable.eof()); readAndAssert(*buf_ptr, ",./"); - assert(buf_ptr->eof()); + assertTrue(buf_ptr->eof()); peekable.assertCanBeDestructed(); } diff --git a/dbms/src/Parsers/ParsedTemplateFormatString.cpp b/dbms/src/Parsers/ParsedTemplateFormatString.cpp new file mode 100644 index 00000000000..2fbbb8965fd --- /dev/null +++ b/dbms/src/Parsers/ParsedTemplateFormatString.cpp @@ -0,0 +1,217 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_TEMPLATE_FORMAT; +} + +ParsedTemplateFormatString::ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idx_by_name) +{ + try + { + parse(format_string, idx_by_name); + } + catch (DB::Exception & e) + { + if (e.code() != ErrorCodes::INVALID_TEMPLATE_FORMAT) + throwInvalidFormat(e.message(), columnsCount()); + else + throw; + } +} + + +void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name) +{ + enum ParserState + { + Delimiter, + Column, + Format + }; + + const char * pos = format_string.c_str(); + const char * end = format_string.c_str() + format_string.size(); + const char * token_begin = pos; + ParserState state = Delimiter; + delimiters.emplace_back(); + for (; *pos; ++pos) + { + switch (state) + { + case Delimiter: + if (*pos == '$') + { + delimiters.back().append(token_begin, pos - token_begin); + ++pos; + if (*pos == '{') + { + token_begin = pos + 1; + state = Column; + } + else if (*pos == '$') + { + token_begin = pos; + } + else + throwInvalidFormat("at pos " + std::to_string(pos - format_string.c_str()) + + ": expected '{' or '$' after '$'", columnsCount()); + } + break; + + case Column: + column_names.emplace_back(); + pos 
= readMayBeQuotedColumnNameInto(pos, end - pos, column_names.back()); + + if (*pos == ':') + state = Format; + else if (*pos == '}') + { + formats.push_back(ColumnFormat::None); + delimiters.emplace_back(); + state = Delimiter; + } + else + throwInvalidFormat("Expected ':' or '}' after column name: \"" + column_names.back() + "\"", columnsCount()); + + token_begin = pos + 1; + format_idx_to_column_idx.emplace_back(idx_by_name(column_names.back())); + break; + + case Format: + if (*pos == '}') + { + formats.push_back(stringToFormat(String(token_begin, pos - token_begin))); + token_begin = pos + 1; + delimiters.emplace_back(); + state = Delimiter; + } + } + } + if (state != Delimiter) + throwInvalidFormat("Unbalanced parentheses", columnsCount()); + delimiters.back().append(token_begin, pos - token_begin); +} + + +ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) const +{ + if (col_format.empty()) + return ColumnFormat::None; + else if (col_format == "None") + return ColumnFormat::None; + else if (col_format == "Escaped") + return ColumnFormat::Escaped; + else if (col_format == "Quoted") + return ColumnFormat::Quoted; + else if (col_format == "CSV") + return ColumnFormat::Csv; + else if (col_format == "JSON") + return ColumnFormat::Json; + else if (col_format == "XML") + return ColumnFormat::Xml; + else if (col_format == "Raw") + return ColumnFormat::Raw; + else + throwInvalidFormat("Unknown field format " + col_format, columnsCount()); +} + +size_t ParsedTemplateFormatString::columnsCount() const +{ + return format_idx_to_column_idx.size(); +} + +String ParsedTemplateFormatString::formatToString(ParsedTemplateFormatString::ColumnFormat format) +{ + switch (format) + { + case ColumnFormat::None: + return "None"; + case ColumnFormat::Escaped: + return "Escaped"; + case ColumnFormat::Quoted: + return "Quoted"; + case ColumnFormat::Csv: + return "CSV"; + case ColumnFormat::Json: + return "Json"; + case 
ColumnFormat::Xml: + return "Xml"; + case ColumnFormat::Raw: + return "Raw"; + } + __builtin_unreachable(); +} + +const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s) +{ + s.clear(); + if (!size) + return pos; + ReadBufferFromMemory buf{pos, size}; + if (*pos == '"') + readDoubleQuotedStringWithSQLStyle(s, buf); + else if (*pos == '`') + readBackQuotedStringWithSQLStyle(s, buf); + else if (isWordCharASCII(*pos)) + { + size_t name_size = 1; + while (name_size < size && isWordCharASCII(*(pos + name_size))) + ++name_size; + s = String{pos, name_size}; + return pos + name_size; + } + return pos + buf.count(); +} + +String ParsedTemplateFormatString::dump() const +{ + WriteBufferFromOwnString res; + res << "Delimiter " << 0 << ": "; + verbosePrintString(delimiters.front().c_str(), delimiters.front().c_str() + delimiters.front().size(), res); + + size_t num_columns = std::max(formats.size(), format_idx_to_column_idx.size()); + for (size_t i = 0; i < num_columns; ++i) + { + res << "\nColumn " << i << ": \""; + if (column_names.size() <= i) + res << ""; + else if (column_names[i].empty()) + res << ""; + else + res << column_names[i]; + + res << "\" (mapped to table column "; + if (format_idx_to_column_idx.size() <= i) + res << ""; + else if (!format_idx_to_column_idx[i]) + res << ""; + else + res << *format_idx_to_column_idx[i]; + + res << "), Format " << (i < formats.size() ? formatToString(formats[i]) : ""); + + res << "\nDelimiter " << i + 1 << ": "; + if (delimiters.size() <= i + 1) + res << ""; + else + verbosePrintString(delimiters[i + 1].c_str(), delimiters[i + 1].c_str() + delimiters[i + 1].size(), res); + } + + return res.str(); +} + +void ParsedTemplateFormatString::throwInvalidFormat(const String & message, size_t column) const +{ + throw Exception("Invalid format string for Template: " + message + " (near column " + std::to_string(column) + + ")" + ". 
Parsed format string:\n" + dump() + "\n", + ErrorCodes::INVALID_TEMPLATE_FORMAT); +} + +} diff --git a/dbms/src/Parsers/ParsedTemplateFormatString.h b/dbms/src/Parsers/ParsedTemplateFormatString.h new file mode 100644 index 00000000000..5353f336f64 --- /dev/null +++ b/dbms/src/Parsers/ParsedTemplateFormatString.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +struct ParsedTemplateFormatString +{ + enum class ColumnFormat + { + None, + Escaped, + Quoted, + Csv, + Json, + Xml, + Raw + }; + + /// Format string has syntax: "Delimiter0 ${ColumnName0:Format0} Delimiter1 ${ColumnName1:Format1} Delimiter2" + /// The following vectors is filled with corresponding values, delimiters.size() - 1 = formats.size() = format_idx_to_column_idx.size() + /// If format_idx_to_column_idx[i] has no value, then TemplateRowInputFormat will skip i-th column. + + std::vector delimiters; + std::vector formats; + std::vector> format_idx_to_column_idx; + + /// For diagnostic info + Strings column_names; + + typedef std::function(const String &)> ColumnIdxGetter; + + ParsedTemplateFormatString() = default; + ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idx_by_name); + + void parse(const String & format_string, const ColumnIdxGetter & idx_by_name); + + ColumnFormat stringToFormat(const String & format) const; + static String formatToString(ColumnFormat format); + static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s); + size_t columnsCount() const; + + String dump() const; + [[noreturn]] void throwInvalidFormat(const String & message, size_t column) const; +}; + +} + diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index fb2b96a561e..d16b9ab53dd 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ 
-1,10 +1,7 @@ #include #include -#include -#include #include #include -#include namespace DB @@ -12,157 +9,9 @@ namespace DB namespace ErrorCodes { - extern const int INVALID_TEMPLATE_FORMAT; + extern const int SYNTAX_ERROR; } -ParsedTemplateFormatString::ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idxByName) -{ - enum ParserState - { - Delimiter, - Column, - Format - }; - - const char * pos = format_string.c_str(); - const char * end = format_string.c_str() + format_string.size(); - const char * token_begin = pos; - String column_name; - ParserState state = Delimiter; - delimiters.emplace_back(); - for (; *pos; ++pos) - { - switch (state) - { - case Delimiter: - if (*pos == '$') - { - delimiters.back().append(token_begin, pos - token_begin); - ++pos; - if (*pos == '{') - { - token_begin = pos + 1; - state = Column; - } - else if (*pos == '$') - { - token_begin = pos; - } - else - { - throw Exception("Invalid template format string: pos " + std::to_string(pos - format_string.c_str()) + - ": expected '{' or '$' after '$'", ErrorCodes::INVALID_TEMPLATE_FORMAT); - } - } - break; - - case Column: - pos = readMayBeQuotedColumnNameInto(pos, end - pos, column_name); - - if (*pos == ':') - state = Format; - else if (*pos == '}') - { - formats.push_back(ColumnFormat::None); - delimiters.emplace_back(); - state = Delimiter; - } - else - throw Exception("Invalid template format string: Expected ':' or '}' after column name: \"" + column_name + "\"", - ErrorCodes::INVALID_TEMPLATE_FORMAT); - - token_begin = pos + 1; - format_idx_to_column_idx.emplace_back(idxByName(column_name)); - break; - - case Format: - if (*pos == '}') - { - formats.push_back(stringToFormat(String(token_begin, pos - token_begin))); - token_begin = pos + 1; - delimiters.emplace_back(); - state = Delimiter; - } - } - } - if (state != Delimiter) - throw Exception("Invalid template format string: check parentheses balance", ErrorCodes::INVALID_TEMPLATE_FORMAT); - 
delimiters.back().append(token_begin, pos - token_begin); -} - - -ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) -{ - if (col_format.empty()) - return ColumnFormat::None; - else if (col_format == "None") - return ColumnFormat::None; - else if (col_format == "Escaped") - return ColumnFormat::Escaped; - else if (col_format == "Quoted") - return ColumnFormat::Quoted; - else if (col_format == "CSV") - return ColumnFormat::Csv; - else if (col_format == "JSON") - return ColumnFormat::Json; - else if (col_format == "XML") - return ColumnFormat::Xml; - else if (col_format == "Raw") - return ColumnFormat::Raw; - else - throw Exception("Invalid template format string: unknown field format " + col_format, - ErrorCodes::INVALID_TEMPLATE_FORMAT); -} - -size_t ParsedTemplateFormatString::columnsCount() const -{ - return format_idx_to_column_idx.size(); -} - -String ParsedTemplateFormatString::formatToString(ParsedTemplateFormatString::ColumnFormat format) -{ - switch (format) - { - case ColumnFormat::None: - return "None"; - case ColumnFormat::Escaped: - return "Escaped"; - case ColumnFormat::Quoted: - return "Quoted"; - case ColumnFormat::Csv: - return "CSV"; - case ColumnFormat::Json: - return "Json"; - case ColumnFormat::Xml: - return "Xml"; - case ColumnFormat::Raw: - return "Raw"; - } - __builtin_unreachable(); -} - -const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s) -{ - s.clear(); - if (!size) - return pos; - ReadBufferFromMemory buf{pos, size}; - if (*pos == '"') - readDoubleQuotedStringWithSQLStyle(s, buf); - else if (*pos == '`') - readBackQuotedStringWithSQLStyle(s, buf); - else if (isWordCharASCII(*pos)) - { - size_t name_size = 1; - while (name_size < size && isWordCharASCII(*(pos + name_size))) - ++name_size; - s = String{pos, name_size}; - return pos + name_size; - } - return pos + buf.count(); -} - - 
TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & settings_) : IOutputFormat(header_, out_), settings(settings_) { @@ -185,7 +34,7 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const B for (size_t i = 0; i < format.format_idx_to_column_idx.size(); ++i) { if (!format.format_idx_to_column_idx[i]) - throw Exception("Output part name cannot be empty, it's a bug.", ErrorCodes::LOGICAL_ERROR); + format.throwInvalidFormat("Output part name cannot be empty, it's a bug.", i); switch (static_cast(*format.format_idx_to_column_idx[i])) { case OutputPart::Data: @@ -195,17 +44,17 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const B case OutputPart::ExtremesMin: case OutputPart::ExtremesMax: if (format.formats[i] != ColumnFormat::None) - throw Exception("invalid template: wrong serialization type for data, totals, min or max", - ErrorCodes::INVALID_TEMPLATE_FORMAT); + format.throwInvalidFormat("Serialization type for data, totals, min and max must be empty or None", i); break; default: if (format.formats[i] == ColumnFormat::None) - throw Exception("Serialization type for output part rows, rows_before_limit, time, rows_read or bytes_read not specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); + format.throwInvalidFormat("Serialization type for output part rows, rows_before_limit, time, " + "rows_read or bytes_read is not specified", i); break; } } if (data_idx != 0) - throw Exception("invalid template: ${data} must be the first output part", ErrorCodes::INVALID_TEMPLATE_FORMAT); + format.throwInvalidFormat("${data} must be the first output part", 0); /// Parse format string for rows row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) @@ -215,13 +64,13 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const B /// Validate format string for rows if (row_format.delimiters.size() 
== 1) - throw Exception("invalid template: no columns specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); + row_format.throwInvalidFormat("No columns specified", 0); for (size_t i = 0; i < row_format.columnsCount(); ++i) { if (!row_format.format_idx_to_column_idx[i]) - throw Exception("Cannot skip format field for output, it's a bug.", ErrorCodes::LOGICAL_ERROR); + row_format.throwInvalidFormat("Cannot skip format field for output, it's a bug.", i); if (row_format.formats[i] == ColumnFormat::None) - throw Exception("Serialization type for file column " + std::to_string(i) + " not specified", ErrorCodes::INVALID_TEMPLATE_FORMAT); + row_format.throwInvalidFormat("Serialization type for file column is not specified", i); } } @@ -246,7 +95,7 @@ TemplateBlockOutputFormat::OutputPart TemplateBlockOutputFormat::stringToOutputP else if (part == "bytes_read") return OutputPart::BytesRead; else - throw Exception("invalid template: unknown output part " + part, ErrorCodes::INVALID_TEMPLATE_FORMAT); + throw Exception("Unknown output part " + part, ErrorCodes::SYNTAX_ERROR); } void TemplateBlockOutputFormat::writeRow(const Chunk & chunk, size_t row_num) @@ -331,48 +180,48 @@ void TemplateBlockOutputFormat::finalize() size_t parts = format.format_idx_to_column_idx.size(); - for (size_t j = 0; j < parts; ++j) + for (size_t i = 0; i < parts; ++i) { auto type = std::make_shared(); ColumnWithTypeAndName col(type->createColumnConst(1, row_count), type, String("tmp")); - switch (static_cast(*format.format_idx_to_column_idx[j])) + switch (static_cast(*format.format_idx_to_column_idx[i])) { case OutputPart::Totals: if (!totals) - throw Exception("invalid template: cannot print totals for this request", ErrorCodes::INVALID_TEMPLATE_FORMAT); + format.throwInvalidFormat("Cannot print totals for this request", i); writeRow(totals, 0); break; case OutputPart::ExtremesMin: if (!extremes) - throw Exception("invalid template: cannot print extremes for this request", 
ErrorCodes::INVALID_TEMPLATE_FORMAT); + format.throwInvalidFormat("Cannot print extremes for this request", i); writeRow(extremes, 0); break; case OutputPart::ExtremesMax: if (!extremes) - throw Exception("invalid template: cannot print extremes for this request", ErrorCodes::INVALID_TEMPLATE_FORMAT); + format.throwInvalidFormat("Cannot print extremes for this request", i); writeRow(extremes, 1); break; case OutputPart::Rows: - writeValue(row_count, format.formats[j]); + writeValue(row_count, format.formats[i]); break; case OutputPart::RowsBeforeLimit: if (!rows_before_limit_set) - throw Exception("invalid template: cannot print rows_before_limit for this request", ErrorCodes::INVALID_TEMPLATE_FORMAT); - writeValue(rows_before_limit, format.formats[j]); + format.throwInvalidFormat("Cannot print rows_before_limit for this request", i); + writeValue(rows_before_limit, format.formats[i]); break; case OutputPart::TimeElapsed: - writeValue(watch.elapsedSeconds(), format.formats[j]); + writeValue(watch.elapsedSeconds(), format.formats[i]); break; case OutputPart::RowsRead: - writeValue(progress.read_rows.load(), format.formats[j]); + writeValue(progress.read_rows.load(), format.formats[i]); break; case OutputPart::BytesRead: - writeValue(progress.read_bytes.load(), format.formats[j]); + writeValue(progress.read_bytes.load(), format.formats[i]); break; default: break; } - writeString(format.delimiters[j + 1], out); + writeString(format.delimiters[i + 1], out); } finalized = true; diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h index 1f0e2b1cf58..844595b422d 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h @@ -4,42 +4,12 @@ #include #include #include +#include namespace DB { -struct ParsedTemplateFormatString -{ - enum class ColumnFormat - { - None, - Escaped, - Quoted, - Csv, - Json, - Xml, - 
Raw - }; - - /// Format string has syntax: "Delimiter0 ${ColumnName0:Format0} Delimiter1 ${ColumnName1:Format1} Delimiter2" - /// The following vectors is filled with corresponding values, delimiters.size() - 1 = formats.size() = format_idx_to_column_idx.size() - /// If format_idx_to_column_idx[i] has no value, then TemplateRowInputStream will skip i-th column. - - std::vector delimiters; - std::vector formats; - std::vector> format_idx_to_column_idx; - - typedef std::function(const String &)> ColumnIdxGetter; - - ParsedTemplateFormatString() = default; - ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idxByName); - static ColumnFormat stringToFormat(const String & format); - static String formatToString(ColumnFormat format); - static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s); - size_t columnsCount() const; -}; - class TemplateBlockOutputFormat : public IOutputFormat { using ColumnFormat = ParsedTemplateFormatString::ColumnFormat; diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index a0e28e886a1..9ef9152d88d 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -9,11 +9,11 @@ namespace DB namespace ErrorCodes { -extern const int INVALID_TEMPLATE_FORMAT; extern const int ATTEMPT_TO_READ_AFTER_EOF; extern const int CANNOT_READ_ALL_DATA; extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; extern const int CANNOT_PARSE_QUOTED_STRING; +extern const int SYNTAX_ERROR; } @@ -30,8 +30,8 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h if (partName == "data") return 0; else if (partName.empty()) /// For skipping some values in prefix and suffix - return {}; - throw Exception("invalid template format: unknown input part " + partName, ErrorCodes::INVALID_TEMPLATE_FORMAT); + return std::optional(); + 
throw Exception("Unknown input part " + partName, ErrorCodes::SYNTAX_ERROR); }); /// Validate format string for whole input @@ -41,16 +41,16 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h if (format.format_idx_to_column_idx[i]) { if (has_data) - throw Exception("${data} can occur only once", ErrorCodes::INVALID_TEMPLATE_FORMAT); + format.throwInvalidFormat("${data} can occur only once", i); if (format.formats[i] != ColumnFormat::None) - throw Exception("invalid template format: ${data} must have empty or None serialization type", ErrorCodes::INVALID_TEMPLATE_FORMAT); + format.throwInvalidFormat("${data} must have empty or None deserialization type", i); has_data = true; format_data_idx = i; } else { if (format.formats[i] == ColumnFormat::Xml || format.formats[i] == ColumnFormat::Raw) - throw Exception("None, XML and Raw deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); + format.throwInvalidFormat("XML and Raw deserialization is not supported", i); } } @@ -58,7 +58,7 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) -> std::optional { if (colName.empty()) - return {}; + return std::optional(); return header_.getPositionByName(colName); }); @@ -67,17 +67,16 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h for (size_t i = 0; i < row_format.columnsCount(); ++i) { if (row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw) - throw Exception("invalid template format: None, XML and Raw deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); + row_format.throwInvalidFormat("XML and Raw deserialization is not supported", i); if (row_format.format_idx_to_column_idx[i]) { if (row_format.formats[i] == ColumnFormat::None) - throw Exception("invalid template format: None, XML and Raw 
deserialization is not supported", ErrorCodes::INVALID_TEMPLATE_FORMAT); + row_format.throwInvalidFormat("Column is not skipped, but deserialization type is None", i); size_t col_idx = *row_format.format_idx_to_column_idx[i]; if (column_in_format[col_idx]) - throw Exception("invalid template format: duplicate column " + header_.getColumnsWithTypeAndName()[col_idx].name, - ErrorCodes::INVALID_TEMPLATE_FORMAT); + row_format.throwInvalidFormat("Duplicate column", i); column_in_format[col_idx] = true; } } @@ -85,14 +84,22 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h void TemplateRowInputFormat::readPrefix() { - tryReadPrefixOrSuffix(0, format_data_idx); + size_t last_successfully_parsed_idx = 0; + try + { + tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); + } + catch (Exception & e) + { + format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); + } } /// Asserts delimiters and skips fields in prefix or suffix. /// tryReadPrefixOrSuffix(...) 
is used in checkForSuffix() to avoid throwing an exception after read of each row /// (most likely false will be returned on first call of checkString(...)) template -ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t input_part_beg, size_t input_part_end) +ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end) { static constexpr bool throw_exception = std::is_same_v; @@ -261,9 +268,10 @@ bool TemplateRowInputFormat::checkForSuffix() { PeekableReadBufferCheckpoint checkpoint{buf}; bool suffix_found = false; + size_t last_successfully_parsed_idx = format_data_idx + 1; try { - suffix_found = tryReadPrefixOrSuffix(format_data_idx + 1, format.columnsCount()); + suffix_found = tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); } catch (const Exception & e) { @@ -273,8 +281,6 @@ bool TemplateRowInputFormat::checkForSuffix() throw; } - /// TODO better diagnostic in case of invalid suffix - if (unlikely(suffix_found)) { skipSpaces(); @@ -288,6 +294,24 @@ bool TemplateRowInputFormat::checkForSuffix() bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { + out << "Suffix does not match: "; + size_t last_successfully_parsed_idx = format_data_idx + 1; + bool catched = false; + try + { + tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); + } + catch (Exception & e) + { + out << e.message() << " Near column " << last_successfully_parsed_idx; + catched = true; + } + if (!catched) + out << " There is some data after suffix (EOF expected). 
"; + out << " Format string (from format_schema): \n" << format.dump() << "\n"; + out << "Trying to parse next row, because suffix does not match:\n"; + + out << "Using format string (from format_schema_rows): " << row_format.dump() << "\n"; try { if (likely(row_num != 1)) diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 408fcf26203..4932cbf5023 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -33,7 +33,7 @@ private: inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); } template - ReturnType tryReadPrefixOrSuffix(size_t input_part_beg, size_t input_part_end); + ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end); bool checkForSuffix(); [[noreturn]] void throwUnexpectedEof(); From e3982704eaa6cf58275d06fad69d48dba2e45228 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 30 Aug 2019 00:46:36 +0300 Subject: [PATCH 28/43] try to fix build --- .../Formats/Impl/TemplateRowInputFormat.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 9ef9152d88d..300c7522efb 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -30,7 +30,15 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h if (partName == "data") return 0; else if (partName.empty()) /// For skipping some values in prefix and suffix - return std::optional(); +#if !__clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + /// Suppress false-positive warning (bug in GCC 9: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86465) + return {}; +#if !__clang__ +#pragma 
GCC diagnostic pop +#endif throw Exception("Unknown input part " + partName, ErrorCodes::SYNTAX_ERROR); }); @@ -58,7 +66,14 @@ TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & h row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) -> std::optional { if (colName.empty()) - return std::optional(); +#if !__clang__ + #pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + return {}; +#if !__clang__ +#pragma GCC diagnostic pop +#endif return header_.getPositionByName(colName); }); From 4c744af11392113d3ab82467fbb2e4d258ad33b0 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 30 Aug 2019 17:38:24 +0300 Subject: [PATCH 29/43] minor style fixes --- .../Formats/tests/tab_separated_streams.cpp | 2 +- dbms/src/IO/PeekableReadBuffer.cpp | 6 +-- .../Formats/Impl/CSVRowInputFormat.cpp | 16 +++---- .../Formats/Impl/CSVRowInputFormat.h | 11 +++-- .../Impl/TabSeparatedRowInputFormat.cpp | 42 +++++++++--------- .../Formats/Impl/TabSeparatedRowInputFormat.h | 11 ++--- .../Impl/TemplateBlockOutputFormat.cpp | 4 +- .../Formats/Impl/TemplateBlockOutputFormat.h | 2 +- .../Formats/Impl/TemplateRowInputFormat.cpp | 43 ++++++++++++------- .../Formats/Impl/TemplateRowInputFormat.h | 6 +-- .../RowInputFormatWithDiagnosticInfo.cpp | 11 ++--- .../RowInputFormatWithDiagnosticInfo.h | 9 ++-- 12 files changed, 84 insertions(+), 79 deletions(-) diff --git a/dbms/src/Formats/tests/tab_separated_streams.cpp b/dbms/src/Formats/tests/tab_separated_streams.cpp index 46999f6e594..f05a83bc751 100644 --- a/dbms/src/Formats/tests/tab_separated_streams.cpp +++ b/dbms/src/Formats/tests/tab_separated_streams.cpp @@ -41,7 +41,7 @@ try RowInputFormatParams params{DEFAULT_INSERT_BLOCK_SIZE, 0, 0, 0, []{}}; - InputFormatPtr input_format = std::make_shared(in_buf, sample, false, false, params, format_settings); + InputFormatPtr input_format = std::make_shared(sample, in_buf, params, 
false, false, format_settings); BlockInputStreamPtr block_input = std::make_shared(std::move(input_format)); BlockOutputStreamPtr block_output = std::make_shared( diff --git a/dbms/src/IO/PeekableReadBuffer.cpp b/dbms/src/IO/PeekableReadBuffer.cpp index dde9ec35aff..07c815931b3 100644 --- a/dbms/src/IO/PeekableReadBuffer.cpp +++ b/dbms/src/IO/PeekableReadBuffer.cpp @@ -15,7 +15,6 @@ PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ checkStateCorrect(); } -/// Saves unread data to own memory, so it will be possible to read it later. Loads next data to sub-buffer bool PeekableReadBuffer::peekNext() { checkStateCorrect(); @@ -29,7 +28,7 @@ bool PeekableReadBuffer::peekNext() if (checkpoint) copy_from = checkpoint; bytes_read = copy_from - sub_buf.buffer().begin(); - bytes_to_copy = sub_buf.buffer().end() - copy_from; // sub_buf.available(); + bytes_to_copy = sub_buf.buffer().end() - copy_from; /// sub_buf.available(); if (!bytes_to_copy) { bytes += bytes_read; @@ -143,7 +142,7 @@ bool PeekableReadBuffer::nextImpl() /// FIXME wrong bytes count because it can read the same data again after rollbackToCheckpoint() /// However, changing bytes count on every call of next() (even after rollback) allows to determine if some pointers were invalidated. 
checkStateCorrect(); - bool res = true; + bool res; if (!checkpoint) { @@ -207,7 +206,6 @@ void PeekableReadBuffer::checkStateCorrect() const } else { - if (!currentlyReadFromOwnMemory() && peeked_size) throw DB::Exception("Own buffer is not empty", ErrorCodes::LOGICAL_ERROR); } diff --git a/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 440487e758a..b5ee30fb7f8 100644 --- a/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -18,8 +18,8 @@ namespace ErrorCodes } -CSVRowInputFormat::CSVRowInputFormat( - ReadBuffer & in_, Block header_, Params params_, bool with_names_, const FormatSettings & format_settings_) +CSVRowInputFormat::CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, + bool with_names_, const FormatSettings & format_settings_) : RowInputFormatWithDiagnosticInfo(header_, in_, params_) , with_names(with_names_) , format_settings(format_settings_) @@ -356,16 +356,16 @@ void CSVRowInputFormat::syncAfterError() skipToNextLineOrEOF(in); } -void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) +void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, + ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) { skipWhitespacesAndTabs(in); prev_pos = in.position(); - if (column_indexes_for_input_fields[input_position]) + if (column_indexes_for_input_fields[file_column]) { - const bool is_last_file_column = input_position + 1 == column_indexes_for_input_fields.size(); - if (!readField(column, type, is_last_file_column, *column_indexes_for_input_fields[input_position])) + const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size(); + if (!readField(column, type, is_last_file_column, 
*column_indexes_for_input_fields[file_column])) column.insertDefault(); } else @@ -428,7 +428,7 @@ void registerInputFormatProcessorCSV(FormatFactory & factory) IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared(buf, sample, std::move(params), with_names, settings); + return std::make_shared(sample, buf, params, with_names, settings); }); } } diff --git a/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.h b/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.h index 6ca40425ebd..b8a3a956e1e 100644 --- a/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -11,8 +11,6 @@ namespace DB { -class ReadBuffer; - /** A stream for inputting data in csv format. * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. */ @@ -21,7 +19,8 @@ class CSVRowInputFormat : public RowInputFormatWithDiagnosticInfo public: /** with_names - in the first line the header with column names */ - CSVRowInputFormat(ReadBuffer & in_, Block header_, Params params_, bool with_names_, const FormatSettings & format_settings_); + CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, + bool with_names_, const FormatSettings & format_settings_); String getName() const override { return "CSVRowInputFormat"; } @@ -42,7 +41,7 @@ private: using OptionalIndexes = std::vector>; OptionalIndexes column_indexes_for_input_fields; - /// Tracks which colums we have read in a single read() call. + /// Tracks which columns we have read in a single read() call. /// For columns that are never read, it is initialized to false when we /// read the file header, and never changed afterwards. /// For other columns, it is updated on each read() call. 
@@ -55,8 +54,8 @@ private: void addInputColumn(const String & column_name); bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) override; + void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, + ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override; bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter; diff --git a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 602b29e08c5..cb9ff5b53be 100644 --- a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -42,9 +42,9 @@ static void checkForCarriageReturn(ReadBuffer & in) } -TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( - ReadBuffer & in_, Block header_, bool with_names_, bool with_types_, Params params_, const FormatSettings & format_settings_) - : RowInputFormatWithDiagnosticInfo(std::move(header_), in_, std::move(params_)), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) +TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, + bool with_names_, bool with_types_, const FormatSettings & format_settings_) + : RowInputFormatWithDiagnosticInfo(header_, in_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) { auto & sample = getPort().getHeader(); size_t num_columns = sample.columns(); @@ -174,9 +174,9 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens updateDiagnosticInfo(); - for (size_t input_position = 0; input_position 
< column_indexes_for_input_fields.size(); ++input_position) + for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column) { - const auto & column_index = column_indexes_for_input_fields[input_position]; + const auto & column_index = column_indexes_for_input_fields[file_column]; if (column_index) { data_types[*column_index]->deserializeAsTextEscaped(*columns[*column_index], in, format_settings); @@ -188,7 +188,7 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens } /// skip separators - if (input_position + 1 < column_indexes_for_input_fields.size()) + if (file_column + 1 < column_indexes_for_input_fields.size()) { assertChar('\t', in); } @@ -208,20 +208,20 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { - for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position) + for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column) { - if (input_position == 0 && in.eof()) + if (file_column == 0 && in.eof()) { out << "\n"; return false; } - if (column_indexes_for_input_fields[input_position].has_value()) + if (column_indexes_for_input_fields[file_column].has_value()) { auto & header = getPort().getHeader(); - size_t col_idx = column_indexes_for_input_fields[input_position].value(); + size_t col_idx = column_indexes_for_input_fields[file_column].value(); if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], - out, input_position)) + out, file_column)) return false; } else @@ -229,12 +229,12 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & static const String skipped_column_str = ""; static const DataTypePtr skipped_column_type = std::make_shared(); static const MutableColumnPtr 
skipped_column = skipped_column_type->createColumn(); - if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, input_position)) + if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) return false; } /// Delimiters - if (input_position + 1 == column_indexes_for_input_fields.size()) + if (file_column + 1 == column_indexes_for_input_fields.size()) { if (!in.eof()) { @@ -277,7 +277,8 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & { out << "ERROR: Line feed found where tab is expected." " It's like your file has less columns than expected.\n" - "And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n"; + "And if your file have right number of columns, " + "maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n"; } else if (*in.position() == '\r') { @@ -297,12 +298,11 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & return true; } -void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, - ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) +void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, + ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) { prev_pos = in.position(); - if (column_indexes_for_input_fields[input_position]) + if (column_indexes_for_input_fields[file_column]) type->deserializeAsTextEscaped(column, in, format_settings); else { @@ -329,7 +329,7 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory) IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared(buf, sample, false, false, std::move(params), settings); + return std::make_shared(sample, buf, 
params, false, false, settings); }); } @@ -342,7 +342,7 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory) IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared(buf, sample, true, false, std::move(params), settings); + return std::make_shared(sample, buf, params, true, false, settings); }); } @@ -355,7 +355,7 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory) IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared(buf, sample, true, true, std::move(params), settings); + return std::make_shared(sample, buf, params, true, true, settings); }); } } diff --git a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 222dcfce473..a28ac62ed4f 100644 --- a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -8,9 +8,6 @@ namespace DB { -class ReadBuffer; - - /** A stream to input data in tsv format. 
*/ class TabSeparatedRowInputFormat : public RowInputFormatWithDiagnosticInfo @@ -19,8 +16,8 @@ public: /** with_names - the first line is the header with the names of the columns * with_types - on the next line header with type names */ - TabSeparatedRowInputFormat( - ReadBuffer & in_, Block header_, bool with_names_, bool with_types_, Params params_, const FormatSettings & format_settings_); + TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, + bool with_names_, bool with_types_, const FormatSettings & format_settings_); String getName() const override { return "TabSeparatedRowInputFormat"; } @@ -49,8 +46,8 @@ private: void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext); bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) override; + void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, + ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override; bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } }; diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index d16b9ab53dd..cbaef1b0012 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -12,7 +12,7 @@ namespace ErrorCodes extern const int SYNTAX_ERROR; } -TemplateBlockOutputFormat::TemplateBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & settings_) +TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_) : IOutputFormat(header_, out_), 
settings(settings_) { auto & sample = getPort(PortKind::Main).getHeader(); @@ -237,7 +237,7 @@ void registerOutputFormatProcessorTemplate(FormatFactory & factory) FormatFactory::WriteCallback, const FormatSettings & settings) { - return std::make_shared(buf, sample, settings); + return std::make_shared(sample, buf, settings); }); } } diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h index 844595b422d..059313e02af 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h @@ -14,7 +14,7 @@ class TemplateBlockOutputFormat : public IOutputFormat { using ColumnFormat = ParsedTemplateFormatString::ColumnFormat; public: - TemplateBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & settings_); + TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_); String getName() const override { return "TemplateBlockOutputFormat"; } diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 300c7522efb..c2145f88e69 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -17,9 +17,9 @@ extern const int SYNTAX_ERROR; } -TemplateRowInputFormat::TemplateRowInputFormat(ReadBuffer & in_, const Block & header_, const Params & params_, - const FormatSettings & settings_, bool ignore_spaces_) - : RowInputFormatWithDiagnosticInfo(header_, in_, params_), buf(in_), data_types(header_.getDataTypes()), +TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, + const FormatSettings & settings_, bool ignore_spaces_) + : RowInputFormatWithDiagnosticInfo(header_, buf, params_), buf(in_), data_types(header_.getDataTypes()), 
settings(settings_), ignore_spaces(ignore_spaces_) { /// Parse format string for whole input @@ -311,22 +311,35 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col { out << "Suffix does not match: "; size_t last_successfully_parsed_idx = format_data_idx + 1; - bool catched = false; + const ReadBuffer::Position row_begin_pos = buf.position(); + bool caught = false; try { + PeekableReadBufferCheckpoint checkpoint{buf, true}; tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); } catch (Exception & e) { out << e.message() << " Near column " << last_successfully_parsed_idx; - catched = true; + caught = true; + } + if (!caught) + { + out << " There is some data after suffix (EOF expected, got "; + verbosePrintString(buf.position(), std::min(buf.buffer().end(), buf.position() + 16), out); + out << "). "; } - if (!catched) - out << " There is some data after suffix (EOF expected). "; out << " Format string (from format_schema): \n" << format.dump() << "\n"; - out << "Trying to parse next row, because suffix does not match:\n"; - out << "Using format string (from format_schema_rows): " << row_format.dump() << "\n"; + if (row_begin_pos != buf.position()) + { + /// Pointers to buffer memory were invalidated during checking for suffix + out << "\nCannot print more diagnostic info."; + return false; + } + + out << "\nUsing format string (from format_schema_rows): " << row_format.dump() << "\n"; + out << "\nTrying to parse next row, because suffix does not match:\n"; try { if (likely(row_num != 1)) @@ -400,14 +413,14 @@ void TemplateRowInputFormat::writeErrorStringForWrongDelimiter(WriteBuffer & out out << '\n'; } -void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) +void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, + 
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) { prev_pos = buf.position(); - if (row_format.format_idx_to_column_idx[input_position]) - deserializeField(*type, column, row_format.formats[input_position]); + if (row_format.format_idx_to_column_idx[file_column]) + deserializeField(*type, column, row_format.formats[file_column]); else - skipField(row_format.formats[input_position]); + skipField(row_format.formats[file_column]); curr_pos = buf.position(); } @@ -499,7 +512,7 @@ void registerInputFormatProcessorTemplate(FormatFactory & factory) IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared(buf, sample, std::move(params), settings, ignore_spaces); + return std::make_shared(sample, buf, params, settings, ignore_spaces); }); } } diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 4932cbf5023..85247f5e5a3 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -15,8 +15,8 @@ class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo { using ColumnFormat = ParsedTemplateFormatString::ColumnFormat; public: - TemplateRowInputFormat(ReadBuffer & in_, const Block & header_, const Params & params_, - const FormatSettings & settings_, bool ignore_spaces_); + TemplateRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, + const FormatSettings & settings_, bool ignore_spaces_); String getName() const override { return "TemplateRowInputFormat"; } @@ -38,7 +38,7 @@ private: [[noreturn]] void throwUnexpectedEof(); bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, + void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, 
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override; bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override; void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim); diff --git a/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp index 4458f7f52e0..452cfa46acf 100644 --- a/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp +++ b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp @@ -84,12 +84,13 @@ String DB::RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() return out.str(); } -bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, +bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, + const DataTypePtr & type, IColumn & column, WriteBuffer & out, - size_t input_position) + size_t file_column) { - out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ') + out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ') << "name: " << alignedName(col_name, max_length_of_column_name) << "type: " << alignedName(type->getName(), max_length_of_data_type_name); @@ -99,7 +100,7 @@ bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co try { - tryDeserializeFiled(type, column, input_position, prev_position, curr_position); + tryDeserializeFiled(type, column, file_column, prev_position, curr_position); } catch (...) 
{ @@ -139,7 +140,7 @@ bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co if (type->haveMaximumSizeOfValue()) { - if (isGarbageAfterField(input_position, curr_position)) + if (isGarbageAfterField(file_column, curr_position)) { out << "ERROR: garbage after " << type->getName() << ": "; verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out); diff --git a/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h index f335908cecf..98dea066436 100644 --- a/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h +++ b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h @@ -19,17 +19,14 @@ public: protected: void updateDiagnosticInfo(); bool deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, IColumn & column, - WriteBuffer & out, size_t input_position); + WriteBuffer & out, size_t file_column); String alignedName(const String & name, size_t max_length) const; virtual bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) = 0; - virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t input_position, ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) = 0; + virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, + ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) = 0; virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0; - //ReadBuffer & istr; - //Block header; - /// For convenient diagnostics in case of an error. 
size_t row_num = 0; From 4cbf095a4a51127c4e8f0c918bcfada7e5a25ce1 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 30 Aug 2019 21:21:54 +0300 Subject: [PATCH 30/43] update docs --- docs/en/interfaces/formats.md | 50 +++++++++++++++++------------------ docs/ru/interfaces/formats.md | 49 +++++++++++++++++----------------- 2 files changed, 49 insertions(+), 50 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 8333dbc5351..9590665df54 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -132,25 +132,26 @@ Format string `format_schema_rows` specifies rows format with the following synt `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), - `column_i` is a name of a column whose values are to be selected or inserted, + `column_i` is a name of a column whose values are to be selected or inserted (if empty, then column will be skipped), `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported: - `CSV`, `JSON`, `XML` (similarly to the formats of the same names) - `Escaped` (similarly to `TSV`) - `Quoted` (similarly to `Values`) - `Raw` (without escaping, similarly to `TSVRaw`) + - `None` (no escaping rule, see further) - Escaping rule may be omitted and in this case `Escaped` will be used. `XML` and `Raw` are suitable only for output. + If escaping rule is omitted, then `None` will be used. `XML` and `Raw` are suitable only for output. 
So, for the following format string: - `Search phrase: ${SearchPhrase:Quoted}, count: ${c}, ad price: $$${price:JSON};` + `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters respectively. For example: `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` - The `format_schema_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one. + The `format_schema_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) Format string `format_schema` has the same syntax as `format_schema_rows` and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: @@ -164,9 +165,9 @@ Format string `format_schema` has the same syntax as `format_schema_rows` and al - `rows_read` is the number of rows have been read - `bytes_read` is the number of bytes (uncompressed) have been read - The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified. The remaining placeholders may have any escaping rule specified. + The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified. If the `format_schema` setting is an empty string, `${data}` is used as default value. - For insert queries `format_schema` must be like `some prefix ${data} some suffix` i.e. it must contain a single placeholder `data`. 
+ For insert queries format allows to skip some columns or some fields in prefix or suffix (see example). `Select` example: ```sql @@ -209,32 +210,31 @@ format_schema_rows_between_delimiter = '\n ' ``` `Insert` example: -```json -{"array": - [ - {"PageViews": 5, "UserID": "4324182021466249494", "Duration": 146, "Sign": -1}, - {"PageViews": 6, "UserID": "4324182021466249494", "Duration": 185, "Sign": 1} - ] -} +``` +Some header +Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 +Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 +Total rows: 2 ``` ```sql -cat data.json | ./clickhouse client --query "INSERT INTO UserActivity FORMAT Template SETTINGS format_schema = '{\"array\":\n [\n \${data}\n ]\n}', format_schema_rows = '{\"PageViews\": \${project:JSON}, \"UserID\": \${date:JSON}, \"Duration\": \${size:JSON}, \"Sign\": \${hits:JSON}}', format_schema_rows_between_delimiter = ',\n '" -``` -In this example, `"` and `$` are escaped with `\` to pass settings through the command line argument correctly. The settings may look like this without escaping: -``` -format_schema = '{"array": - [ - ${data} - ] -}', -format_schema_rows = '{"PageViews": ${PageViews:JSON}, "UserID": ${UserID:JSON}, "Duration": ${Duration:JSON}, "Sign": ${Sign:JSON}}', -format_schema_rows_between_delimiter = ',\n ' +INSERT INTO UserActivity FORMAT Template SETTINGS +format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n', +format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}' ``` +`PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows: ` in suffix will be ignored. All delimiters in the input data must be strictly equal to delimiters in specified format strings. 
## TemplateIgnoreSpaces {#templateignorespaces} -Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. This format is suitable only for input. +This format is suitable only for input. +Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. Also allows to specify empty placeholders (`${}` or `${:None}`) to split some delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters. +It's possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json): +```sql +INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS +format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}', +format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}', +format_schema_rows_between_delimiter = ',' +``` ## TSKV {#tskv} diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 60081205ee0..8f9d100ad91 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -131,25 +131,26 @@ world `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... 
delimiter_N`, где `delimiter_i` - разделители между значениями (символ `$` в разделителе экранируется как `$$`), - `column_i` - имена столбцов, значения которых должны быть выведены или считаны, + `column_i` - имена столбцов, значения которых должны быть выведены или считаны (если имя не указано - столбец пропускается), `serializeAs_i` - тип экранирования для значений соответствующего столбца. Поддерживаются следующие типы экранирования: - `CSV`, `JSON`, `XML` (как в одноимённых форматах) - `Escaped` (как в `TSV`) - `Quoted` (как в `Values`) - `Raw` (без экранирования, как в `TSVRaw`) + - `None` (тип экранирования отсутствует, см. далее) - Тип экранирования для столбца можно не указывать, в таком случае используется `Escaped`. `XML` и `Raw` поддерживаются только для вывода. + Если для столбца не указан тип экранирования, используется `None`. `XML` и `Raw` поддерживаются только для вывода. Так, в форматной строке - `Search phrase: ${SearchPhrase:Quoted}, count: ${c}, ad price: $$${price:JSON};` + `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` между разделителями `Search phrase: `, `, count: `, `, ad price: $` и `;` при выводе будут подставлены (при вводе - будут ожидаться) значения столбцов `SearchPhrase`, `c` и `price`, сериализованные как `Quoted`, `Escaped` и `JSON` соответственно, например: `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` - Настройка `format_schema_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожмдается при вводе) после каждой строки, кроме последней. + Настройка `format_schema_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожидается при вводе) после каждой строки, кроме последней. По умолчанию `\n`. Форматная строка `format_schema` имеет аналогичный `format_schema_rows` синтаксис и позволяет указать префикс, суффикс и способ вывода дополнительной информации. 
Вместо имён столбцов в ней указываются следующие имена подстановок: @@ -163,9 +164,10 @@ world - `rows_read` - сколько строк было прочитано при выполнении запроса - `bytes_read` - сколько байт (несжатых) было прочитано при выполнении запроса - У подстановок `data`, `totals`, `min` и `max` не должны быть указаны типы экранирования. Остальные подстановки - это отдельные значения, для них может быть указан любой тип экранирования. + У подстановок `data`, `totals`, `min` и `max` не должны быть указаны типы экранирования (или должен быть указан `None`). Остальные подстановки - это отдельные значения, для них может быть указан любой тип экранирования. Если строка `format_schema` пустая, то по-умолчанию используется `${data}`. - При вводе форматная строка `format_schema` должна иметь вид `some prefix ${data} some suffix` т.е. содержать единственную подстановку `data`. + Из всех перечисленных подстановок форматная строка `format_schema` для ввода может содержать только `data`. + Также при вводе формат поддерживает пропуск значений столбцов и пропуск значений в префиксе и суффиксе (см. пример). 
Пример вывода: ```sql @@ -208,32 +210,29 @@ format_schema_rows_between_delimiter = '\n ' ``` Пример ввода: -```json -{"array": - [ - {"PageViews": 5, "UserID": "4324182021466249494", "Duration": 146, "Sign": -1}, - {"PageViews": 6, "UserID": "4324182021466249494", "Duration": 185, "Sign": 1} - ] -} +``` +Some header +Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 +Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 +Total rows: 2 ``` ```sql -cat data.json | ./clickhouse client --query "INSERT INTO UserActivity FORMAT Template SETTINGS format_schema = '{\"array\":\n [\n \${data}\n ]\n}', format_schema_rows = '{\"PageViews\": \${project:JSON}, \"UserID\": \${date:JSON}, \"Duration\": \${size:JSON}, \"Sign\": \${hits:JSON}}', format_schema_rows_between_delimiter = ',\n '" -``` -В данном примере экранирование `"` и `$` нужно, чтобы настройки корректно передались через аргумент командной строки. Без этих экранирований настройки могли бы выглядеть так: -``` -format_schema = '{"array": - [ - ${data} - ] -}', -format_schema_rows = '{"PageViews": ${PageViews:JSON}, "UserID": ${UserID:JSON}, "Duration": ${Duration:JSON}, "Sign": ${Sign:JSON}}', -format_schema_rows_between_delimiter = ',\n ' +INSERT INTO UserActivity FORMAT Template SETTINGS +format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n', +format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}' ``` +`PageViews`, `UserID`, `Duration` и `Sign` внутри подстановок - имена столбцов в таблице, в которую вставляются данные. Значения после `Useless field` в строках и значение после `\nTotal rows: ` в суффиксе будут проигнорированы. Все разделители во входных данных должны строго соответствовать разделителям в форматных строках. 
## TemplateIgnoreSpaces {#templateignorespaces} -Отличается от формата `Template` тем, что пропускает пробельные символы между разделителями и значениями во входном потоке. При этом, если форматные строки содержат пробельные символы, эти символы будут ожидаться во входных данных. Подходит только для ввода. +Подходит только для ввода. Отличается от формата `Template` тем, что пропускает пробельные символы между разделителями и значениями во входном потоке. Также в этом формате можно указать пустые подстановки с типом экранирования `None` (`${}` или `${:None}`), чтобы разбить разделители на несколько частей, пробелы между которыми должны игнорироваться. Такие подстановки используются только для пропуска пробелов. С помощью этого формата можно считывать `JSON`, если значения столбцов в нём всегда идут в одном порядке в каждой строке. Например, для вставки данных из примера вывода формата [JSON](#json) в таблицу со столбцами `phrase` и `cnt` можно использовать следующий запрос: +```sql +INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS +format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}', +format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}', +format_schema_rows_between_delimiter = ',' +``` ## TSKV {#tskv} From cd5d733f75b7d3b4c975132712819e6a323fce18 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 30 Aug 2019 22:08:23 +0300 Subject: [PATCH 31/43] rewrite unit test with gtest --- dbms/src/IO/tests/CMakeLists.txt | 3 - .../IO/tests/gtest_peekable_read_buffer.cpp | 131 +++++++++++++++++ dbms/src/IO/tests/peekable_read_buffer.cpp | 138 ------------------ 3 files changed, 131 insertions(+), 141 deletions(-) create mode 100644 dbms/src/IO/tests/gtest_peekable_read_buffer.cpp delete mode 100644 dbms/src/IO/tests/peekable_read_buffer.cpp diff --git 
a/dbms/src/IO/tests/CMakeLists.txt b/dbms/src/IO/tests/CMakeLists.txt index 9ba5659449a..2c3dc307b18 100644 --- a/dbms/src/IO/tests/CMakeLists.txt +++ b/dbms/src/IO/tests/CMakeLists.txt @@ -77,9 +77,6 @@ target_link_libraries (limit_read_buffer2 PRIVATE clickhouse_common_io) add_executable (parse_date_time_best_effort parse_date_time_best_effort.cpp) target_link_libraries (parse_date_time_best_effort PRIVATE clickhouse_common_io) -add_executable (peekable_read_buffer peekable_read_buffer.cpp) -target_link_libraries (peekable_read_buffer PRIVATE clickhouse_common_io) - add_executable (zlib_ng_bug zlib_ng_bug.cpp) target_link_libraries (zlib_ng_bug PRIVATE ${Poco_Foundation_LIBRARY}) if(NOT USE_INTERNAL_POCO_LIBRARY) diff --git a/dbms/src/IO/tests/gtest_peekable_read_buffer.cpp b/dbms/src/IO/tests/gtest_peekable_read_buffer.cpp new file mode 100644 index 00000000000..331184e701c --- /dev/null +++ b/dbms/src/IO/tests/gtest_peekable_read_buffer.cpp @@ -0,0 +1,131 @@ +#include + +#include +#include +#include +#include +#include + +void readAndAssert(DB::ReadBuffer & buf, const char * str) +{ + size_t n = strlen(str); + char tmp[n]; + buf.readStrict(tmp, n); + ASSERT_EQ(strncmp(tmp, str, n), 0); +} + +void assertAvailable(DB::ReadBuffer & buf, const char * str) +{ + size_t n = strlen(str); + ASSERT_EQ(buf.available(), n); + ASSERT_EQ(strncmp(buf.position(), str, n), 0); +} + +TEST(PeekableReadBuffer, CheckpointsWorkCorrectly) +try +{ + std::string s1 = "0123456789"; + std::string s2 = "qwertyuiop"; + std::string s3 = "asdfghjkl;"; + std::string s4 = "zxcvbnm,./"; + DB::ReadBufferFromString b1(s1); + DB::ReadBufferFromString b2(s2); + DB::ReadBufferFromString b3(s3); + DB::ReadBufferFromString b4(s4); + + DB::ConcatReadBuffer concat({&b1, &b2, &b3, &b4}); + DB::PeekableReadBuffer peekable(concat, 0, 16); + + ASSERT_TRUE(!peekable.eof()); + assertAvailable(peekable, "0123456789"); + { + DB::PeekableReadBufferCheckpoint checkpoint{peekable}; + readAndAssert(peekable, 
"01234"); + } + bool exception = false; + try + { + peekable.rollbackToCheckpoint(); + } + catch (DB::Exception & e) + { + if (e.code() != DB::ErrorCodes::LOGICAL_ERROR) + throw; + exception = true; + } + ASSERT_TRUE(exception); + assertAvailable(peekable, "56789"); + + readAndAssert(peekable, "56"); + + peekable.setCheckpoint(); + readAndAssert(peekable, "789qwertyu"); + peekable.rollbackToCheckpoint(); + peekable.dropCheckpoint(); + assertAvailable(peekable, "789"); + peekable.peekNext(); + assertAvailable(peekable, "789qwertyuiop"); + ASSERT_EQ(peekable.lastPeeked().size(), 10); + ASSERT_EQ(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10), 0); + + exception = false; + try + { + DB::PeekableReadBufferCheckpoint checkpoint{peekable, true}; + peekable.ignore(30); + } + catch (DB::Exception & e) + { + if (e.code() != DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED) + throw; + exception = true; + } + ASSERT_TRUE(exception); + assertAvailable(peekable, "789qwertyuiop"); + ASSERT_EQ(peekable.lastPeeked().size(), 10); + ASSERT_EQ(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10), 0); + + readAndAssert(peekable, "789qwertyu"); + peekable.setCheckpoint(); + readAndAssert(peekable, "iopasdfghj"); + assertAvailable(peekable, "kl;"); + peekable.dropCheckpoint(); + + peekable.setCheckpoint(); + readAndAssert(peekable, "kl;zxcvbnm,./"); + ASSERT_TRUE(peekable.eof()); + ASSERT_TRUE(peekable.eof()); + ASSERT_TRUE(peekable.eof()); + peekable.rollbackToCheckpoint(); + readAndAssert(peekable, "kl;zxcvbnm"); + peekable.dropCheckpoint(); + + exception = false; + try + { + peekable.assertCanBeDestructed(); + } + catch (DB::Exception & e) + { + if (e.code() != DB::ErrorCodes::LOGICAL_ERROR) + throw; + exception = true; + } + ASSERT_TRUE(exception); + + auto buf_ptr = peekable.takeUnreadData(); + ASSERT_TRUE(peekable.eof()); + ASSERT_TRUE(peekable.eof()); + ASSERT_TRUE(peekable.eof()); + + readAndAssert(*buf_ptr, ",./"); + ASSERT_TRUE(buf_ptr->eof()); + + 
peekable.assertCanBeDestructed(); +} +catch (const DB::Exception & e) +{ + std::cerr << e.what() << ", " << e.displayText() << std::endl; + throw; +} + diff --git a/dbms/src/IO/tests/peekable_read_buffer.cpp b/dbms/src/IO/tests/peekable_read_buffer.cpp deleted file mode 100644 index d9149f5377d..00000000000 --- a/dbms/src/IO/tests/peekable_read_buffer.cpp +++ /dev/null @@ -1,138 +0,0 @@ -#include -#include -#include -#include -#include - -void assertTrue(bool b) -{ - if (!b) - throw DB::Exception("assert failed", DB::ErrorCodes::LOGICAL_ERROR); -} - -void readAndAssert(DB::ReadBuffer & buf, const char * str) -{ - size_t n = strlen(str); - char tmp[n]; - buf.readStrict(tmp, n); - assertTrue(strncmp(tmp, str, n) == 0); -} - -void assertAvailable(DB::ReadBuffer & buf, const char * str) -{ - size_t n = strlen(str); - assertTrue(buf.available() == n); - assertTrue(strncmp(buf.position(), str, n) == 0); -} - -int main(int, char **) -{ - try - { - std::string s1 = "0123456789"; - std::string s2 = "qwertyuiop"; - std::string s3 = "asdfghjkl;"; - std::string s4 = "zxcvbnm,./"; - DB::ReadBufferFromString b1(s1); - DB::ReadBufferFromString b2(s2); - DB::ReadBufferFromString b3(s3); - DB::ReadBufferFromString b4(s4); - - DB::ConcatReadBuffer concat({&b1, &b2, &b3, &b4}); - DB::PeekableReadBuffer peekable(concat, 0, 16); - - assertTrue(!peekable.eof()); - assertAvailable(peekable, "0123456789"); - { - DB::PeekableReadBufferCheckpoint checkpoint{peekable}; - readAndAssert(peekable, "01234"); - } - bool exception = false; - try - { - peekable.rollbackToCheckpoint(); - } - catch (DB::Exception & e) - { - if (e.code() != DB::ErrorCodes::LOGICAL_ERROR) - throw; - exception = true; - } - assertTrue(exception); - assertAvailable(peekable, "56789"); - - readAndAssert(peekable, "56"); - - peekable.setCheckpoint(); - readAndAssert(peekable, "789qwertyu"); - peekable.rollbackToCheckpoint(); - peekable.dropCheckpoint(); - assertAvailable(peekable, "789"); - peekable.peekNext(); - 
assertAvailable(peekable, "789qwertyuiop"); - assertTrue(peekable.lastPeeked().size() == 10); - assertTrue(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10) == 0); - - exception = false; - try - { - DB::PeekableReadBufferCheckpoint checkpoint{peekable, true}; - peekable.ignore(30); - } - catch (DB::Exception & e) - { - if (e.code() != DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED) - throw; - exception = true; - } - assertTrue(exception); - assertAvailable(peekable, "789qwertyuiop"); - assertTrue(peekable.lastPeeked().size() == 10); - assertTrue(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10) == 0); - - readAndAssert(peekable, "789qwertyu"); - peekable.setCheckpoint(); - readAndAssert(peekable, "iopasdfghj"); - assertAvailable(peekable, "kl;"); - peekable.dropCheckpoint(); - - peekable.setCheckpoint(); - readAndAssert(peekable, "kl;zxcvbnm,./"); - assertTrue(peekable.eof()); - assertTrue(peekable.eof()); - assertTrue(peekable.eof()); - peekable.rollbackToCheckpoint(); - readAndAssert(peekable, "kl;zxcvbnm"); - peekable.dropCheckpoint(); - - exception = false; - try - { - peekable.assertCanBeDestructed(); - } - catch (DB::Exception & e) - { - if (e.code() != DB::ErrorCodes::LOGICAL_ERROR) - throw; - exception = true; - } - assertTrue(exception); - - auto buf_ptr = peekable.takeUnreadData(); - assertTrue(peekable.eof()); - assertTrue(peekable.eof()); - assertTrue(peekable.eof()); - - readAndAssert(*buf_ptr, ",./"); - assertTrue(buf_ptr->eof()); - - peekable.assertCanBeDestructed(); - } - catch (const DB::Exception & e) - { - std::cerr << e.what() << ", " << e.displayText() << std::endl; - return 1; - } - - return 0; -} From 23b28e6cdc322b136e37419771de094302727f8d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 1 Sep 2019 02:35:33 +0300 Subject: [PATCH 32/43] Added another test for deadlock --- .../01007_r1r2_w_r2r1_deadlock.reference | 0 .../0_stateless/01007_r1r2_w_r2r1_deadlock.sh | 61 +++++++++++++++++++ 2 files changed, 61 insertions(+) create 
mode 100644 dbms/tests/queries/0_stateless/01007_r1r2_w_r2r1_deadlock.reference create mode 100755 dbms/tests/queries/0_stateless/01007_r1r2_w_r2r1_deadlock.sh diff --git a/dbms/tests/queries/0_stateless/01007_r1r2_w_r2r1_deadlock.reference b/dbms/tests/queries/0_stateless/01007_r1r2_w_r2r1_deadlock.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/01007_r1r2_w_r2r1_deadlock.sh b/dbms/tests/queries/0_stateless/01007_r1r2_w_r2r1_deadlock.sh new file mode 100755 index 00000000000..e28cf5d9f7b --- /dev/null +++ b/dbms/tests/queries/0_stateless/01007_r1r2_w_r2r1_deadlock.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. $CURDIR/../shell_config.sh + +set -e + +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS a" +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS b" + +$CLICKHOUSE_CLIENT --query "CREATE TABLE a (x UInt8) ENGINE = MergeTree ORDER BY tuple()" +$CLICKHOUSE_CLIENT --query "CREATE TABLE b (x UInt8) ENGINE = MergeTree ORDER BY tuple()" + + +function thread1() +{ + while true; do + seq 1 100 | awk '{ print "SELECT x FROM a WHERE x IN (SELECT toUInt8(count()) FROM system.tables);" }' | $CLICKHOUSE_CLIENT -n + done +} + +function thread2() +{ + while true; do + seq 1 100 | awk '{ print "SELECT x FROM b WHERE x IN (SELECT toUInt8(count()) FROM system.tables);" }' | $CLICKHOUSE_CLIENT -n + done +} + +function thread3() +{ + while true; do + $CLICKHOUSE_CLIENT --query "ALTER TABLE a MODIFY COLUMN x Nullable(UInt8)" + $CLICKHOUSE_CLIENT --query "ALTER TABLE a MODIFY COLUMN x UInt8" + done +} + +function thread4() +{ + while true; do + $CLICKHOUSE_CLIENT --query "ALTER TABLE b MODIFY COLUMN x Nullable(UInt8)" + $CLICKHOUSE_CLIENT --query "ALTER TABLE b MODIFY COLUMN x UInt8" + done +} + +# https://stackoverflow.com/questions/9954794/execute-a-shell-function-with-timeout +export -f thread1; +export -f thread2; +export -f thread3; +export -f thread4; + +TIMEOUT=10 + 
+timeout $TIMEOUT bash -c thread1 2> /dev/null & +timeout $TIMEOUT bash -c thread2 2> /dev/null & +timeout $TIMEOUT bash -c thread3 2> /dev/null & +timeout $TIMEOUT bash -c thread4 2> /dev/null & + +wait + +$CLICKHOUSE_CLIENT --query "DROP TABLE a" +$CLICKHOUSE_CLIENT --query "DROP TABLE b" From 6f02cd145f656edb9401b45c46a69854e886871c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 1 Sep 2019 04:32:44 +0300 Subject: [PATCH 33/43] Added a kludge (experimental) --- dbms/src/Common/ErrorCodes.cpp | 1 + dbms/src/Common/RWLock.cpp | 75 +++++++++++++++++++++++- dbms/src/Common/tests/gtest_rw_lock.cpp | 76 +++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 2 deletions(-) diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index 87ab252c583..f1c03c5fd0e 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -447,6 +447,7 @@ namespace ErrorCodes extern const int QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW = 470; extern const int SETTINGS_ARE_NOT_SUPPORTED = 471; extern const int IMMUTABLE_SETTING = 472; + extern const int DEADLOCK_AVOIDED = 473; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Common/RWLock.cpp b/dbms/src/Common/RWLock.cpp index e343ce0b0cd..91d6e759c46 100644 --- a/dbms/src/Common/RWLock.cpp +++ b/dbms/src/Common/RWLock.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace ProfileEvents { @@ -29,6 +31,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int DEADLOCK_AVOIDED; } @@ -53,6 +56,44 @@ public: }; +namespace +{ + /// Global information about all read locks that query has. It is needed to avoid some type of deadlocks. 
+ + class QueryLockInfo + { + private: + std::mutex mutex; + std::map queries; + + public: + void add(const String & query_id) + { + std::lock_guard lock(mutex); + ++queries[query_id]; + } + + void remove(const String & query_id) + { + std::lock_guard lock(mutex); + auto it = queries.find(query_id); + assert(it != queries.end()); + if (--it->second == 0) + queries.erase(it); + } + + void check(const String & query_id) + { + std::lock_guard lock(mutex); + if (queries.count(query_id)) + throw Exception("Deadlock avoided. Client must retry.", ErrorCodes::DEADLOCK_AVOIDED); + } + }; + + QueryLockInfo all_read_locks; +} + + RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & query_id) { Stopwatch watch(CLOCK_MONOTONIC_COARSE); @@ -95,8 +136,26 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & return existing_holder_ptr; } + /** If the query already has any active read lock and tries to acquire another read lock + * but it is not in front of the queue and has to wait, deadlock is possible: + * + * Example (four queries, two RWLocks - 'a' and 'b'): + * + * --> time --> + * + * q1: ra rb + * q2: wa + * q3: rb ra + * q4: wb + * + * We will throw an exception instead. 
+ */ + if (type == Type::Write || queue.empty() || queue.back().type == Type::Write) { + if (queue.back().type == Type::Write && query_id != RWLockImpl::NO_QUERY) + all_read_locks.check(query_id); + /// Create new group of clients it_group = queue.emplace(queue.end(), type); } @@ -104,6 +163,9 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & { /// Will append myself to last group it_group = std::prev(queue.end()); + + if (it_group != queue.begin() && query_id != RWLockImpl::NO_QUERY) + all_read_locks.check(query_id); } /// Append myself to the end of chosen group @@ -130,7 +192,12 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & res->thread_id = this_thread_id; if (query_id != RWLockImpl::NO_QUERY) + { query_id_to_holder.emplace(query_id, res); + + if (type == Type::Read) + all_read_locks.add(query_id); + } res->query_id = query_id; finalize_metrics(); @@ -140,12 +207,15 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & RWLockImpl::LockHolderImpl::~LockHolderImpl() { - std::unique_lock lock(parent->mutex); + std::lock_guard lock(parent->mutex); /// Remove weak_ptrs to the holder, since there are no owners of the current lock parent->thread_to_holder.erase(thread_id); parent->query_id_to_holder.erase(query_id); + if (*it_client == RWLockImpl::Read && query_id != RWLockImpl::NO_QUERY) + all_read_locks.remove(query_id); + /// Removes myself from client list of our group it_group->clients.erase(it_client); @@ -166,6 +236,7 @@ RWLockImpl::LockHolderImpl::LockHolderImpl(RWLock && parent_, RWLockImpl::Groups : parent{std::move(parent_)}, it_group{it_group_}, it_client{it_client_}, active_client_increment{(*it_client == RWLockImpl::Read) ? 
CurrentMetrics::RWLockActiveReaders : CurrentMetrics::RWLockActiveWriters} -{} +{ +} } diff --git a/dbms/src/Common/tests/gtest_rw_lock.cpp b/dbms/src/Common/tests/gtest_rw_lock.cpp index 68927c8bc4a..6826c885ae0 100644 --- a/dbms/src/Common/tests/gtest_rw_lock.cpp +++ b/dbms/src/Common/tests/gtest_rw_lock.cpp @@ -13,6 +13,14 @@ using namespace DB; +namespace DB +{ + namespace ErrorCodes + { + extern const int DEADLOCK_AVOIDED; + } +} + TEST(Common, RWLock_1) { @@ -123,6 +131,74 @@ TEST(Common, RWLock_Recursive) } +TEST(Common, RWLock_Deadlock) +{ + static auto lock1 = RWLockImpl::create(); + static auto lock2 = RWLockImpl::create(); + + /** + * q1: r1 r2 + * q2: w1 + * q3: r2 r1 + * q4: w2 + */ + + std::thread t1([&] () + { + auto holder1 = lock1->getLock(RWLockImpl::Read, "q1"); + usleep(100000); + usleep(100000); + usleep(100000); + try + { + auto holder2 = lock2->getLock(RWLockImpl::Read, "q1"); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::DEADLOCK_AVOIDED) + throw; + } + }); + + std::thread t2([&] () + { + usleep(100000); + auto holder1 = lock1->getLock(RWLockImpl::Write, "q2"); + }); + + std::thread t3([&] () + { + usleep(100000); + usleep(100000); + auto holder2 = lock2->getLock(RWLockImpl::Read, "q3"); + usleep(100000); + usleep(100000); + try + { + auto holder1 = lock1->getLock(RWLockImpl::Read, "q3"); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::DEADLOCK_AVOIDED) + throw; + } + }); + + std::thread t4([&] () + { + usleep(100000); + usleep(100000); + usleep(100000); + auto holder2 = lock2->getLock(RWLockImpl::Write, "q4"); + }); + + t1.join(); + t2.join(); + t3.join(); + t4.join(); +} + + TEST(Common, RWLock_PerfTest_Readers) { constexpr int cycles = 100000; // 100k From 27d753eab7d270bb1e009a950ab661727f5ccd7c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 1 Sep 2019 15:16:31 +0300 Subject: [PATCH 34/43] Fixed error --- dbms/src/Common/RWLock.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/dbms/src/Common/RWLock.cpp b/dbms/src/Common/RWLock.cpp index 91d6e759c46..76671226089 100644 --- a/dbms/src/Common/RWLock.cpp +++ b/dbms/src/Common/RWLock.cpp @@ -153,7 +153,7 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & if (type == Type::Write || queue.empty() || queue.back().type == Type::Write) { - if (queue.back().type == Type::Write && query_id != RWLockImpl::NO_QUERY) + if (type == Type::Read && !queue.empty() && queue.back().type == Type::Write && query_id != RWLockImpl::NO_QUERY) all_read_locks.check(query_id); /// Create new group of clients From 6940395c5d039330c66481e636b7f99125a1e41a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 1 Sep 2019 15:40:06 +0300 Subject: [PATCH 35/43] Fixed tests --- dbms/tests/queries/0_stateless/00763_lock_buffer.sh | 2 +- .../00763_long_lock_buffer_alter_destination_table.sh | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dbms/tests/queries/0_stateless/00763_lock_buffer.sh b/dbms/tests/queries/0_stateless/00763_lock_buffer.sh index 4ec4875e3e2..3146ce96132 100755 --- a/dbms/tests/queries/0_stateless/00763_lock_buffer.sh +++ b/dbms/tests/queries/0_stateless/00763_lock_buffer.sh @@ -17,7 +17,7 @@ function thread1() function thread2() { - seq 1 1000 | sed -r -e 's/.+/SELECT count() FROM test.buffer_00763_2;/' | ${CLICKHOUSE_CLIENT} --multiquery --server_logs_file='/dev/null' --ignore-error 2>&1 | grep -vP '^0$|^10$|^Received exception|^Code: 60|^Code: 218' + seq 1 1000 | sed -r -e 's/.+/SELECT count() FROM test.buffer_00763_2;/' | ${CLICKHOUSE_CLIENT} --multiquery --server_logs_file='/dev/null' --ignore-error 2>&1 | grep -vP '^0$|^10$|^Received exception|^Code: 60|^Code: 218|^Code: 473' } thread1 & diff --git a/dbms/tests/queries/0_stateless/00763_long_lock_buffer_alter_destination_table.sh b/dbms/tests/queries/0_stateless/00763_long_lock_buffer_alter_destination_table.sh index ba50d4e9f04..059d70253b2 100755 --- 
a/dbms/tests/queries/0_stateless/00763_long_lock_buffer_alter_destination_table.sh +++ b/dbms/tests/queries/0_stateless/00763_long_lock_buffer_alter_destination_table.sh @@ -1,6 +1,8 @@ #!/usr/bin/env bash set -e +CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=none + CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . $CURDIR/../shell_config.sh @@ -18,7 +20,7 @@ function thread1() function thread2() { - seq 1 2000 | sed -r -e 's/.+/SELECT sum(length(s)) FROM test.buffer_00763_1;/' | ${CLICKHOUSE_CLIENT} --multiquery --server_logs_file='/dev/null' --ignore-error 2>&1 | grep -vP '^3$' + seq 1 2000 | sed -r -e 's/.+/SELECT sum(length(s)) FROM test.buffer_00763_1;/' | ${CLICKHOUSE_CLIENT} --multiquery --ignore-error 2>&1 | grep -vP '(^3$|^Received exception from server|^Code: 473)' } thread1 & From de44330cfa98d49a510299a0f761aa89599ba7bb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 1 Sep 2019 22:21:00 +0300 Subject: [PATCH 36/43] Updated exception message --- dbms/src/Common/RWLock.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Common/RWLock.cpp b/dbms/src/Common/RWLock.cpp index 76671226089..0b9d0918000 100644 --- a/dbms/src/Common/RWLock.cpp +++ b/dbms/src/Common/RWLock.cpp @@ -86,7 +86,7 @@ namespace { std::lock_guard lock(mutex); if (queries.count(query_id)) - throw Exception("Deadlock avoided. Client must retry.", ErrorCodes::DEADLOCK_AVOIDED); + throw Exception("Possible deadlock avoided. 
Client should retry.", ErrorCodes::DEADLOCK_AVOIDED); } }; From 2d331fb992114364f86807f0be5723c12ad537d7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 2 Sep 2019 03:12:01 +0300 Subject: [PATCH 37/43] Fixed possible deadlock in RWLock --- dbms/src/Common/RWLock.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dbms/src/Common/RWLock.cpp b/dbms/src/Common/RWLock.cpp index 0b9d0918000..c93f413be16 100644 --- a/dbms/src/Common/RWLock.cpp +++ b/dbms/src/Common/RWLock.cpp @@ -110,11 +110,12 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & GroupsContainer::iterator it_group; ClientsContainer::iterator it_client; + /// This object is placed above unique_lock, because it may lock in destructor. + LockHolder res; + std::unique_lock lock(mutex); /// Check if the same query is acquiring previously acquired lock - LockHolder existing_holder_ptr; - auto this_thread_id = std::this_thread::get_id(); auto it_thread = thread_to_holder.find(this_thread_id); @@ -123,17 +124,17 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & it_query = query_id_to_holder.find(query_id); if (it_thread != thread_to_holder.end()) - existing_holder_ptr = it_thread->second.lock(); + res = it_thread->second.lock(); else if (it_query != query_id_to_holder.end()) - existing_holder_ptr = it_query->second.lock(); + res = it_query->second.lock(); - if (existing_holder_ptr) + if (res) { /// XXX: it means we can't upgrade lock from read to write - with proper waiting! 
- if (type != Read || existing_holder_ptr->it_group->type != Read) + if (type != Read || res->it_group->type != Read) throw Exception("Attempt to acquire exclusive lock recursively", ErrorCodes::LOGICAL_ERROR); - return existing_holder_ptr; + return res; } /** If the query already has any active read lock and tries to acquire another read lock @@ -182,7 +183,7 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & throw; } - LockHolder res(new LockHolderImpl(shared_from_this(), it_group, it_client)); + res.reset(new LockHolderImpl(shared_from_this(), it_group, it_client)); /// Wait a notification until we will be the only in the group. it_group->cv.wait(lock, [&] () { return it_group == queue.begin(); }); From e5ad85ba2655d62c47d59376645226de7f7561cd Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 2 Sep 2019 03:21:42 +0300 Subject: [PATCH 38/43] Fixed test --- .../queries/0_stateless/00838_system_tables_drop_table_race.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/tests/queries/0_stateless/00838_system_tables_drop_table_race.sh b/dbms/tests/queries/0_stateless/00838_system_tables_drop_table_race.sh index 06a6b8f7341..9fd84ab1034 100755 --- a/dbms/tests/queries/0_stateless/00838_system_tables_drop_table_race.sh +++ b/dbms/tests/queries/0_stateless/00838_system_tables_drop_table_race.sh @@ -8,6 +8,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS table" seq 1 100 | sed -r -e "s/.+/CREATE TABLE table (x UInt8) ENGINE = MergeTree ORDER BY x; DROP TABLE table;/" | $CLICKHOUSE_CLIENT -n & -seq 1 100 | sed -r -e "s/.+/SELECT * FROM system.tables WHERE database = '${CLICKHOUSE_DATABASE}' LIMIT 1000000, 1;/" | $CLICKHOUSE_CLIENT -n & +seq 1 100 | sed -r -e "s/.+/SELECT * FROM system.tables WHERE database = '${CLICKHOUSE_DATABASE}' LIMIT 1000000, 1;/" | $CLICKHOUSE_CLIENT -n 2>/dev/null & wait From 85f45747a5cd29f80a43b3aef8d82960f8270044 Mon Sep 17 
00:00:00 2001 From: Alexey Milovidov Date: Mon, 2 Sep 2019 04:04:41 +0300 Subject: [PATCH 39/43] Fixed test --- dbms/src/Common/RWLock.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/dbms/src/Common/RWLock.cpp b/dbms/src/Common/RWLock.cpp index c93f413be16..c3e969e1b5f 100644 --- a/dbms/src/Common/RWLock.cpp +++ b/dbms/src/Common/RWLock.cpp @@ -123,18 +123,30 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & if (query_id != RWLockImpl::NO_QUERY) it_query = query_id_to_holder.find(query_id); + bool recursive_by_query_id = false; if (it_thread != thread_to_holder.end()) + { res = it_thread->second.lock(); + } else if (it_query != query_id_to_holder.end()) + { + recursive_by_query_id = true; res = it_query->second.lock(); + } if (res) { /// XXX: it means we can't upgrade lock from read to write - with proper waiting! if (type != Read || res->it_group->type != Read) - throw Exception("Attempt to acquire exclusive lock recursively", ErrorCodes::LOGICAL_ERROR); + { + if (recursive_by_query_id) + throw Exception("Attempt to acquire exclusive lock recursively", ErrorCodes::LOGICAL_ERROR); - return res; + /// threads are reused between queries. If lock found by thread_id, it does not necessarily means that it's recursive. 
+ res.reset(); + } + else + return res; } /** If the query already has any active read lock and tries to acquire another read lock From 3da233ef42265385587e510d4f8fab3f359f784d Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 2 Sep 2019 13:47:06 +0300 Subject: [PATCH 40/43] fix build --- dbms/src/{Parsers => Formats}/ParsedTemplateFormatString.cpp | 2 +- dbms/src/{Parsers => Formats}/ParsedTemplateFormatString.h | 0 dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h | 2 +- dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename dbms/src/{Parsers => Formats}/ParsedTemplateFormatString.cpp (99%) rename dbms/src/{Parsers => Formats}/ParsedTemplateFormatString.h (100%) diff --git a/dbms/src/Parsers/ParsedTemplateFormatString.cpp b/dbms/src/Formats/ParsedTemplateFormatString.cpp similarity index 99% rename from dbms/src/Parsers/ParsedTemplateFormatString.cpp rename to dbms/src/Formats/ParsedTemplateFormatString.cpp index 2fbbb8965fd..f89b1756693 100644 --- a/dbms/src/Parsers/ParsedTemplateFormatString.cpp +++ b/dbms/src/Formats/ParsedTemplateFormatString.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/dbms/src/Parsers/ParsedTemplateFormatString.h b/dbms/src/Formats/ParsedTemplateFormatString.h similarity index 100% rename from dbms/src/Parsers/ParsedTemplateFormatString.h rename to dbms/src/Formats/ParsedTemplateFormatString.h diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h index 059313e02af..9eb5f61d4e7 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace DB diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 
85247f5e5a3..ff7b2adc34a 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include From ab2bd54022d064335d6e691c6519eab43715eee2 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Mon, 2 Sep 2019 16:17:14 +0300 Subject: [PATCH 41/43] Remove a redundant condition (found by PVS-Studio). --- dbms/src/Functions/GeoUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/GeoUtils.cpp b/dbms/src/Functions/GeoUtils.cpp index 847d934c6b4..b274f579aa3 100644 --- a/dbms/src/Functions/GeoUtils.cpp +++ b/dbms/src/Functions/GeoUtils.cpp @@ -332,7 +332,7 @@ UInt64 geohashesInBox(const GeohashesInBoxPreparedArgs & args, char * out) } } - if (items == 0 && args.items_count != 0) + if (items == 0) { size_t l = geohashEncodeImpl(args.longitude_min, args.latitude_min, args.precision, out); out += l; From 2eecd35d27d63877fc26c840e9d19554c4143738 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 2 Sep 2019 16:30:51 +0300 Subject: [PATCH 42/43] Fix PVS warning in PipelineExecutor. 
--- dbms/src/Processors/Executors/PipelineExecutor.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dbms/src/Processors/Executors/PipelineExecutor.cpp b/dbms/src/Processors/Executors/PipelineExecutor.cpp index 6aad6f96b5c..a10a7d267ec 100644 --- a/dbms/src/Processors/Executors/PipelineExecutor.cpp +++ b/dbms/src/Processors/Executors/PipelineExecutor.cpp @@ -287,12 +287,6 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, Stack & children, Stack & pa switch (node.last_processor_status) { case IProcessor::Status::NeedData: - { - add_neighbours_to_prepare_queue(); - try_release_ownership(); - - break; - } case IProcessor::Status::PortFull: { add_neighbours_to_prepare_queue(); From 14e16c7a41c1ef9d9fd3e773945ce23860cb4122 Mon Sep 17 00:00:00 2001 From: Denis Zhuravlev Date: Mon, 2 Sep 2019 12:11:45 -0300 Subject: [PATCH 43/43] Doc change. Example of another approach to collapsing (using negative numbers). (#6751) * Update collapsingmergetree.md Example of another approach to collapsing (using negative numbers). * Update collapsingmergetree.md Example of another approach to collapsing (using negative numbers). * Update collapsingmergetree.md Example of another approach to collapsing (using negative numbers). --- .../table_engines/collapsingmergetree.md | 62 +++++++++++++++++++ .../table_engines/collapsingmergetree.md | 60 ++++++++++++++++++ 2 files changed, 122 insertions(+) diff --git a/docs/en/operations/table_engines/collapsingmergetree.md b/docs/en/operations/table_engines/collapsingmergetree.md index b4f7bd8e6cf..67cad64e950 100644 --- a/docs/en/operations/table_engines/collapsingmergetree.md +++ b/docs/en/operations/table_engines/collapsingmergetree.md @@ -214,4 +214,66 @@ SELECT * FROM UAct FINAL This way of selecting the data is very inefficient. Don't use it for big tables. 
+## Example of another approach + +Example data: + +``` +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ +│ 4324182021466249494 │ -5 │ -146 │ -1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +The idea is that merges take into account only the key fields, so in the "cancel" row we can specify negative values that cancel out the previous version of the row when summing, without using the `Sign` column. For this approach, the data types of `PageViews` and `Duration` must be changed from UInt8 to Int16 so that they can store negative values. + +```sql +CREATE TABLE UAct +( + UserID UInt64, + PageViews Int16, + Duration Int16, + Sign Int8 +) +ENGINE = CollapsingMergeTree(Sign) +ORDER BY UserID +``` + +Let's test the approach: +```sql +insert into UAct values(4324182021466249494, 5, 146, 1); +insert into UAct values(4324182021466249494, -5, -146, -1); +insert into UAct values(4324182021466249494, 6, 185, 1); + +select * from UAct final; // avoid using final in production (just for a test or small tables) +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ + +SELECT + UserID, + sum(PageViews) AS PageViews, + sum(Duration) AS Duration +FROM UAct +GROUP BY UserID +┌──────────────UserID─┬─PageViews─┬─Duration─┐ +│ 4324182021466249494 │ 6 │ 185 │ +└─────────────────────┴───────────┴──────────┘ + +select count() FROM UAct +┌─count()─┐ +│ 3 │ +└─────────┘ + +optimize table UAct final; + +select * FROM UAct +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + + + [Original article](https://clickhouse.yandex/docs/en/operations/table_engines/collapsingmergetree/) diff --git a/docs/ru/operations/table_engines/collapsingmergetree.md b/docs/ru/operations/table_engines/collapsingmergetree.md index
59fdf66324f..17e85bcca0c 100644 --- a/docs/ru/operations/table_engines/collapsingmergetree.md +++ b/docs/ru/operations/table_engines/collapsingmergetree.md @@ -220,5 +220,65 @@ SELECT * FROM UAct FINAL Такой способ выбора данных очень неэффективен. Не используйте его для больших таблиц. +## Пример другого подхода + +Исходные данные: + +``` +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ +│ 4324182021466249494 │ -5 │ -146 │ -1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` +Идея состоит в том, что слияния при сворачивании учитывают только ключевые поля, поэтому в отменяющей строке можно указать отрицательные значения, которые нивелируют предыдущую версию записи при суммировании без учета поля Sign. +Для этого подхода необходимо изменить тип данных `PageViews`, `Duration` для хранения отрицательных значений UInt8 -> Int16. + +```sql +CREATE TABLE UAct +( + UserID UInt64, + PageViews Int16, + Duration Int16, + Sign Int8 +) +ENGINE = CollapsingMergeTree(Sign) +ORDER BY UserID +``` + +Тестируем подход: +```sql +insert into UAct values(4324182021466249494, 5, 146, 1); +insert into UAct values(4324182021466249494, -5, -146, -1); +insert into UAct values(4324182021466249494, 6, 185, 1); + +select * from UAct final; // старайтесь не использовать final (он подходит только для тестов и маленьких таблиц) +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ + +SELECT + UserID, + sum(PageViews) AS PageViews, + sum(Duration) AS Duration +FROM UAct +GROUP BY UserID +┌──────────────UserID─┬─PageViews─┬─Duration─┐ +│ 4324182021466249494 │ 6 │ 185 │ +└─────────────────────┴───────────┴──────────┘ + +select count() FROM UAct +┌─count()─┐ +│ 3 │ +└─────────┘ + +optimize table UAct final; + +select * FROM UAct +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 
185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + [Оригинальная статья](https://clickhouse.yandex/docs/ru/operations/table_engines/collapsingmergetree/)