From 53dcce55e99e42440adf6baa75e4e41edab56376 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 21 Apr 2020 23:41:52 +0300 Subject: [PATCH] better diagnostic info in input formats --- src/IO/readDecimalText.h | 17 ++++----- src/Processors/Formats/IRowInputFormat.cpp | 10 ++++- .../Formats/Impl/CSVRowInputFormat.cpp | 11 ++---- .../Formats/Impl/CSVRowInputFormat.h | 5 +-- .../Impl/TabSeparatedRowInputFormat.cpp | 5 +-- .../Formats/Impl/TabSeparatedRowInputFormat.h | 3 +- .../Formats/Impl/TemplateRowInputFormat.cpp | 5 +-- .../Formats/Impl/TemplateRowInputFormat.h | 3 +- .../Formats/Impl/ValuesBlockInputFormat.cpp | 9 +++-- .../RowInputFormatWithDiagnosticInfo.cpp | 8 ++-- .../RowInputFormatWithDiagnosticInfo.h | 3 +- .../01195_formats_diagnostic_info.reference | 29 +++++++++++++++ .../01195_formats_diagnostic_info.sh | 37 +++++++++++++++++++ 13 files changed, 102 insertions(+), 43 deletions(-) create mode 100644 tests/queries/0_stateless/01195_formats_diagnostic_info.reference create mode 100755 tests/queries/0_stateless/01195_formats_diagnostic_info.sh diff --git a/src/IO/readDecimalText.h b/src/IO/readDecimalText.h index 6edc300eac8..c3c7edb2221 100644 --- a/src/IO/readDecimalText.h +++ b/src/IO/readDecimalText.h @@ -33,17 +33,14 @@ inline bool readDigits(ReadBuffer & buf, T & x, unsigned int & digits, int & exp return false; } - if (!buf.eof()) + switch (*buf.position()) { - switch (*buf.position()) - { - case '-': - sign = -1; - [[fallthrough]]; - case '+': - ++buf.position(); - break; - } + case '-': + sign = -1; + [[fallthrough]]; + case '+': + ++buf.position(); + break; } bool stop = false; diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index 74f4dccaa86..76fc5b86339 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -18,6 +18,8 @@ namespace ErrorCodes extern const int CANNOT_PARSE_UUID; extern const int TOO_LARGE_STRING_SIZE; extern const int INCORRECT_NUMBER_OF_COLUMNS; + extern const int ARGUMENT_OUT_OF_BOUND; + extern const int INCORRECT_DATA; } @@ -30,7 +32,9 @@ bool isParseError(int code) || code == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT || code == ErrorCodes::CANNOT_PARSE_NUMBER || code == ErrorCodes::CANNOT_PARSE_UUID - || code == ErrorCodes::TOO_LARGE_STRING_SIZE; + || code == ErrorCodes::TOO_LARGE_STRING_SIZE + || code == ErrorCodes::ARGUMENT_OUT_OF_BOUND /// For Decimals + || code == ErrorCodes::INCORRECT_DATA; /// For some ReadHelpers } @@ -127,6 +131,10 @@ Chunk IRowInputFormat::generate() { verbose_diagnostic = getDiagnosticInfo(); } + catch (const Exception & exception) + { + verbose_diagnostic = "Cannot get verbose diagnostic: " + exception.message(); + } catch (...) { /// Error while trying to obtain verbose diagnostic. Ok to ignore. diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index bd16de69f75..7bca5f2c5d9 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -273,6 +273,7 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, return false; } + skipWhitespacesAndTabs(in); if (column_indexes_for_input_fields[file_column].has_value()) { const auto & header = getPort().getHeader(); @@ -289,6 +290,7 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) return false; } + skipWhitespacesAndTabs(in); /// Delimiters if (file_column + 1 == column_indexes_for_input_fields.size()) @@ -351,12 +353,8 @@ void CSVRowInputFormat::syncAfterError() skipToNextLineOrEOF(in); } -void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, - ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) +void CSVRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) { - skipWhitespacesAndTabs(in); - prev_pos = in.position(); - if (column_indexes_for_input_fields[file_column]) { const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size(); @@ -367,9 +365,6 @@ void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & String tmp; readCSVString(tmp, in, format_settings.csv); } - - curr_pos = in.position(); - skipWhitespacesAndTabs(in); } bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index 9916e80a079..c884eb6c3db 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -55,11 +55,10 @@ private: void addInputColumn(const String & column_name); bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, - ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override; + void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { - return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter; + return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter && *pos != ' ' && *pos != '\t'; } bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column); diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 014795cade1..dbbd1e28aa0 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -318,10 +318,8 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & return true; } -void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, - ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) +void TabSeparatedRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) { - prev_pos = in.position(); if (column_indexes_for_input_fields[file_column]) { const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size(); @@ -332,7 +330,6 @@ void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, I NullSink null_sink; readEscapedStringInto(null_sink, in); } - curr_pos = in.position(); } void TabSeparatedRowInputFormat::syncAfterError() diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 5bb0a636e1a..b0e52fdde93 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -50,8 +50,7 @@ private: void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension); bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, - ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override; + void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } }; diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 2a0fe6d6ec6..271b9ea75c5 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -410,15 +410,12 @@ void TemplateRowInputFormat::writeErrorStringForWrongDelimiter(WriteBuffer & out out << '\n'; } -void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, - ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) +void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) { - prev_pos = buf.position(); if (row_format.format_idx_to_column_idx[file_column]) deserializeField(type, column, file_column); else skipField(row_format.formats[file_column]); - curr_pos = buf.position(); } bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position) diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 4a39396399d..efdf6eb5e6d 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -42,8 +42,7 @@ private: [[noreturn]] void throwUnexpectedEof(); bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, ReadBuffer::Position & prev_pos, - ReadBuffer::Position & curr_pos) override; + void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override; void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim); diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index 80428bd195a..25e93215ddf 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -20,10 +20,10 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; - extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED; extern const int SYNTAX_ERROR; extern const int TYPE_MISMATCH; extern const int SUPPORT_IS_DISABLED; + extern const int ARGUMENT_OUT_OF_BOUND; } @@ -167,7 +167,9 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx) } catch (const Exception & e) { - if (!isParseError(e.code()) && e.code() != ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED) + /// Do not consider decimal overflow as parse error to avoid attempts to parse it as expression with float literal + bool decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND; + if (!isParseError(e.code()) || decimal_overflow) throw; if (rollback_on_exception) column.popBack(1); @@ -226,7 +228,8 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx } catch (const Exception & e) { - if (!isParseError(e.code())) + bool decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND; + if (!isParseError(e.code()) || decimal_overflow) throw; } if (ok) diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp index ff40597073b..9cd2b14238c 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp @@ -37,8 +37,8 @@ void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo() String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() { - if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed. - return {}; + if (in.eof()) + return "Buffer has gone, cannot extract information about what has been parsed."; WriteBufferFromOwnString out; @@ -102,17 +102,17 @@ bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co << "type: " << alignedName(type->getName(), max_length_of_data_type_name); auto * prev_position = in.position(); - auto * curr_position = in.position(); std::exception_ptr exception; try { - tryDeserializeFiled(type, column, file_column, prev_position, curr_position); + tryDeserializeField(type, column, file_column); } catch (...) { exception = std::current_exception(); } + auto * curr_position = in.position(); if (curr_position < prev_position) throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR); diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h index 1d502ddc281..5bad24cd482 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h @@ -24,8 +24,7 @@ protected: WriteBuffer & out, size_t file_column); virtual bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) = 0; - virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, - ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) = 0; + virtual void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) = 0; virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0; /// For convenient diagnostics in case of an error. diff --git a/tests/queries/0_stateless/01195_formats_diagnostic_info.reference b/tests/queries/0_stateless/01195_formats_diagnostic_info.reference new file mode 100644 index 00000000000..708841559c2 --- /dev/null +++ b/tests/queries/0_stateless/01195_formats_diagnostic_info.reference @@ -0,0 +1,29 @@ +CSV +Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR +ERROR: garbage after DateTime: "7, Hello" +ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format. +ERROR: There is no line feed. "1" found instead. +ERROR: garbage after Decimal(18, 10): "Hello" +Column 0, name: t, type: DateTime, ERROR: text "" is not like DateTime + +CustomSeparatedIgnoreSpaces +Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR +ERROR: There is no delimiter before field 1: expected ",", got "7, Hello," +Column 0, name: t, type: DateTime, ERROR: text ",1" is not like DateTime +Column 0, name: t, type: DateTime, ERROR: text "Hello" is not like DateTime +OK + +TSV +Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR +ERROR: garbage after DateTime: "7Hello12" +ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format. +ERROR: Tab found where line feed is expected. It's like your file has more columns than expected. +ERROR: garbage after Decimal(18, 10): "Hello" +Column 0, name: t, type: DateTime, ERROR: text "" is not like DateTime + +CustomSeparated +Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR +ERROR: There is no delimiter before field 1: expected "", got "7Hello123" +ERROR: There is no delimiter after last field: expected "", got "1" +ERROR: There is no delimiter after last field: expected "", got "Hello" +Column 0, name: t, type: DateTime, ERROR: text "" is not like DateTime diff --git a/tests/queries/0_stateless/01195_formats_diagnostic_info.sh b/tests/queries/0_stateless/01195_formats_diagnostic_info.sh new file mode 100755 index 00000000000..e07b03e9512 --- /dev/null +++ b/tests/queries/0_stateless/01195_formats_diagnostic_info.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. $CURDIR/../shell_config.sh + +PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CSV) +echo '2020-04-21 12:34:56, "Hello", 12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "CSV" +echo '2020-04-21 12:34:56, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo '2020-04-21 12:34:567, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" +#echo '2020-04-21, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" # DateTime parsing is unsafe, it produces unexpected result ("Hello" is parsed as time) +echo '2020-04-21 12:34:56, "Hello", 12345678,1' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo '2020-04-21 12:34:56,,123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:56, "Hello", 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR" + +PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CustomSeparatedIgnoreSpaces --format_custom_escaping_rule CSV --format_custom_field_delimiter ',' --format_custom_row_after_delimiter "") +echo '2020-04-21 12:34:56, "Hello", 12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo -e "\nCustomSeparatedIgnoreSpaces" +echo '2020-04-21 12:34:56, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo '2020-04-21 12:34:567, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo '2020-04-21 12:34:56, "Hello", 12345678,1' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo '2020-04-21 12:34:56,,123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:56, "Hello", 12345678\n\n\n\n ' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "OK" + +PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format TSV) +echo -e '2020-04-21 12:34:56\tHello\t12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo -e "\nTSV" +echo -e '2020-04-21 12:34:56\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR" + +PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CustomSeparated) +echo -e '2020-04-21 12:34:56\tHello\t12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo -e "\nCustomSeparated" +echo -e '2020-04-21 12:34:56\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"