better diagnostic info in input formats

2024-11-21 07:01:59 +00:00 · 2020-04-21 23:41:52 +03:00 · 2020-04-21 23:41:52 +03:00 · 53dcce55e9
commit 53dcce55e9
parent 388bcffec9
13 changed files with 102 additions and 43 deletions
--- a/src/IO/readDecimalText.h
+++ b/src/IO/readDecimalText.h
@ -33,17 +33,14 @@ inline bool readDigits(ReadBuffer & buf, T & x, unsigned int & digits, int & exp
        return false;
    }

-    if (!buf.eof())
+    switch (*buf.position())
    {
-        switch (*buf.position())
-        {
-            case '-':
-                sign = -1;
-                [[fallthrough]];
-            case '+':
-                ++buf.position();
-                break;
-        }
+        case '-':
+            sign = -1;
+            [[fallthrough]];
+        case '+':
+            ++buf.position();
+            break;
    }

    bool stop = false;
--- a/src/Processors/Formats/IRowInputFormat.cpp
+++ b/src/Processors/Formats/IRowInputFormat.cpp
@ -18,6 +18,8 @@ namespace ErrorCodes
    extern const int CANNOT_PARSE_UUID;
    extern const int TOO_LARGE_STRING_SIZE;
    extern const int INCORRECT_NUMBER_OF_COLUMNS;
+    extern const int ARGUMENT_OUT_OF_BOUND;
+    extern const int INCORRECT_DATA;
 }


@ -30,7 +32,9 @@ bool isParseError(int code)
        || code == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT
        || code == ErrorCodes::CANNOT_PARSE_NUMBER
        || code == ErrorCodes::CANNOT_PARSE_UUID
-        || code == ErrorCodes::TOO_LARGE_STRING_SIZE;
+        || code == ErrorCodes::TOO_LARGE_STRING_SIZE
+        || code == ErrorCodes::ARGUMENT_OUT_OF_BOUND       /// For Decimals
+        || code == ErrorCodes::INCORRECT_DATA;             /// For some ReadHelpers
 }


@ -127,6 +131,10 @@ Chunk IRowInputFormat::generate()
        {
            verbose_diagnostic = getDiagnosticInfo();
        }
+        catch (const Exception & exception)
+        {
+            verbose_diagnostic = "Cannot get verbose diagnostic: " + exception.message();
+        }
        catch (...)
        {
            /// Error while trying to obtain verbose diagnostic. Ok to ignore.
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
@ -273,6 +273,7 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
            return false;
        }

+        skipWhitespacesAndTabs(in);
        if (column_indexes_for_input_fields[file_column].has_value())
        {
            const auto & header = getPort().getHeader();
@ -289,6 +290,7 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
            if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
                return false;
        }
+        skipWhitespacesAndTabs(in);

        /// Delimiters
        if (file_column + 1 == column_indexes_for_input_fields.size())
@ -351,12 +353,8 @@ void CSVRowInputFormat::syncAfterError()
    skipToNextLineOrEOF(in);
 }

-void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
-                                            ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
+void CSVRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
 {
-    skipWhitespacesAndTabs(in);
-    prev_pos = in.position();
-
    if (column_indexes_for_input_fields[file_column])
    {
        const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
@ -367,9 +365,6 @@ void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn &
        String tmp;
        readCSVString(tmp, in, format_settings.csv);
    }
-
-    curr_pos = in.position();
-    skipWhitespacesAndTabs(in);
 }

 bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column)
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.h
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h
@ -55,11 +55,10 @@ private:
    void addInputColumn(const String & column_name);

    bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
-    void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
-                             ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
+    void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
    bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override
    {
-        return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter;
+        return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter && *pos != ' ' && *pos != '\t';
    }

    bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column);
--- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
@ -318,10 +318,8 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns &
    return true;
 }

-void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
-                                                     ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
+void TabSeparatedRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
 {
-    prev_pos = in.position();
    if (column_indexes_for_input_fields[file_column])
    {
        const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
@ -332,7 +330,6 @@ void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, I
        NullSink null_sink;
        readEscapedStringInto(null_sink, in);
    }
-    curr_pos = in.position();
 }

 void TabSeparatedRowInputFormat::syncAfterError()
--- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h
+++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h
@ -50,8 +50,7 @@ private:
    void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension);

    bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
-    void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
-                             ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
+    void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
    bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; }
 };

--- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp
@ -410,15 +410,12 @@ void TemplateRowInputFormat::writeErrorStringForWrongDelimiter(WriteBuffer & out
    out << '\n';
 }

-void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
-                                                 ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
+void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
 {
-    prev_pos = buf.position();
    if (row_format.format_idx_to_column_idx[file_column])
        deserializeField(type, column, file_column);
    else
        skipField(row_format.formats[file_column]);
-    curr_pos = buf.position();
 }

 bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position)
--- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h
+++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h
@ -42,8 +42,7 @@ private:
    [[noreturn]] void throwUnexpectedEof();

    bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
-    void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, ReadBuffer::Position & prev_pos,
-                             ReadBuffer::Position & curr_pos) override;
+    void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
    bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override;
    void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim);

--- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
@ -20,10 +20,10 @@ namespace DB
 namespace ErrorCodes
 {
    extern const int LOGICAL_ERROR;
-    extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
    extern const int SYNTAX_ERROR;
    extern const int TYPE_MISMATCH;
    extern const int SUPPORT_IS_DISABLED;
+    extern const int ARGUMENT_OUT_OF_BOUND;
 }


@ -167,7 +167,9 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx)
    }
    catch (const Exception & e)
    {
-        if (!isParseError(e.code()) && e.code() != ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED)
+        /// Do not consider decimal overflow a parse error, to avoid attempts to parse it as an expression with a float literal
+        bool decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND;
+        if (!isParseError(e.code()) || decimal_overflow)
            throw;
        if (rollback_on_exception)
            column.popBack(1);
@ -226,7 +228,8 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx
        }
        catch (const Exception & e)
        {
-            if (!isParseError(e.code()))
+            bool decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND;
+            if (!isParseError(e.code()) || decimal_overflow)
                throw;
        }
        if (ok)
--- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp
+++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp
@ -37,8 +37,8 @@ void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo()

 String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo()
 {
-    if (in.eof())        /// Buffer has gone, cannot extract information about what has been parsed.
-        return {};
+    if (in.eof())
+        return "Buffer has gone, cannot extract information about what has been parsed.";

    WriteBufferFromOwnString out;

@ -102,17 +102,17 @@ bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co
        << "type: " << alignedName(type->getName(), max_length_of_data_type_name);

    auto * prev_position = in.position();
-    auto * curr_position = in.position();
    std::exception_ptr exception;

    try
    {
-        tryDeserializeFiled(type, column, file_column, prev_position, curr_position);
+        tryDeserializeField(type, column, file_column);
    }
    catch (...)
    {
        exception = std::current_exception();
    }
+    auto * curr_position = in.position();

    if (curr_position < prev_position)
        throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
--- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h
+++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h
@ -24,8 +24,7 @@ protected:
                                                WriteBuffer & out, size_t file_column);

    virtual bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) = 0;
-    virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
-                                     ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) = 0;
+    virtual void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) = 0;
    virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0;

    /// For convenient diagnostics in case of an error.
--- a/tests/queries/0_stateless/01195_formats_diagnostic_info.reference
+++ b/tests/queries/0_stateless/01195_formats_diagnostic_info.reference
@ -0,0 +1,29 @@
+CSV
+Column 2,   name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
+ERROR: garbage after DateTime: "7, <DOUBLE QUOTE>Hello<DOUBLE QUOTE>"
+ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.
+ERROR: There is no line feed. "1" found instead.
+ERROR: garbage after Decimal(18, 10): "Hello<LINE FEED>"
+Column 0,   name: t, type: DateTime,        ERROR: text "<LINE FEED>" is not like DateTime
+
+CustomSeparatedIgnoreSpaces
+Column 2,   name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
+ERROR: There is no delimiter before field 1: expected ",", got "7, <DOUBLE QUOTE>Hello<DOUBLE QUOTE>,"
+Column 0,   name: t, type: DateTime,        ERROR: text ",1<LINE FEED>" is not like DateTime
+Column 0,   name: t, type: DateTime,        ERROR: text "Hello<LINE FEED>" is not like DateTime
+OK
+
+TSV
+Column 2,   name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
+ERROR: garbage after DateTime: "7<TAB>Hello<TAB>12"
+ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.
+ERROR: Tab found where line feed is expected. It's like your file has more columns than expected.
+ERROR: garbage after Decimal(18, 10): "Hello<LINE FEED>"
+Column 0,   name: t, type: DateTime,        ERROR: text "<LINE FEED>" is not like DateTime
+
+CustomSeparated
+Column 2,   name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
+ERROR: There is no delimiter before field 1: expected "<TAB>", got "7<TAB>Hello<TAB>123"
+ERROR: There is no delimiter after last field: expected "<LINE FEED>", got "<TAB>1<LINE FEED>"
+ERROR: There is no delimiter after last field: expected "<LINE FEED>", got "Hello<LINE FEED>"
+Column 0,   name: t, type: DateTime,        ERROR: text "<LINE FEED>" is not like DateTime
--- a/tests/queries/0_stateless/01195_formats_diagnostic_info.sh
+++ b/tests/queries/0_stateless/01195_formats_diagnostic_info.sh
@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+. $CURDIR/../shell_config.sh
+
+PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CSV)
+echo '2020-04-21 12:34:56, "Hello", 12345678' | "${PARSER[@]}"  2>&1| grep "ERROR" || echo "CSV"
+echo '2020-04-21 12:34:56, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo '2020-04-21 12:34:567, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
+#echo '2020-04-21, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"    # DateTime parsing is unsafe: it produces an unexpected result ("Hello" is parsed as time)
+echo '2020-04-21 12:34:56, "Hello", 12345678,1' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo '2020-04-21 12:34:56,,123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:56, "Hello", 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
+
+PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CustomSeparatedIgnoreSpaces --format_custom_escaping_rule CSV --format_custom_field_delimiter ',' --format_custom_row_after_delimiter "")
+echo '2020-04-21 12:34:56, "Hello", 12345678' | "${PARSER[@]}"  2>&1| grep "ERROR" || echo -e  "\nCustomSeparatedIgnoreSpaces"
+echo '2020-04-21 12:34:56, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo '2020-04-21 12:34:567, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo '2020-04-21 12:34:56, "Hello", 12345678,1' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo '2020-04-21 12:34:56,,123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:56, "Hello", 12345678\n\n\n\n   ' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "OK"
+
+PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format TSV)
+echo -e '2020-04-21 12:34:56\tHello\t12345678' | "${PARSER[@]}"  2>&1| grep "ERROR" || echo -e "\nTSV"
+echo -e '2020-04-21 12:34:56\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
+
+PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CustomSeparated)
+echo -e '2020-04-21 12:34:56\tHello\t12345678' | "${PARSER[@]}"  2>&1| grep "ERROR" || echo -e "\nCustomSeparated"
+echo -e '2020-04-21 12:34:56\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"