better diagnostic info in input formats

This commit is contained in:
Alexander Tokmakov 2020-04-21 23:41:52 +03:00
parent 388bcffec9
commit 53dcce55e9
13 changed files with 102 additions and 43 deletions

View File

@ -33,17 +33,14 @@ inline bool readDigits(ReadBuffer & buf, T & x, unsigned int & digits, int & exp
return false;
}
if (!buf.eof())
switch (*buf.position())
{
switch (*buf.position())
{
case '-':
sign = -1;
[[fallthrough]];
case '+':
++buf.position();
break;
}
case '-':
sign = -1;
[[fallthrough]];
case '+':
++buf.position();
break;
}
bool stop = false;

View File

@ -18,6 +18,8 @@ namespace ErrorCodes
extern const int CANNOT_PARSE_UUID;
extern const int TOO_LARGE_STRING_SIZE;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int INCORRECT_DATA;
}
@ -30,7 +32,9 @@ bool isParseError(int code)
|| code == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT
|| code == ErrorCodes::CANNOT_PARSE_NUMBER
|| code == ErrorCodes::CANNOT_PARSE_UUID
|| code == ErrorCodes::TOO_LARGE_STRING_SIZE;
|| code == ErrorCodes::TOO_LARGE_STRING_SIZE
|| code == ErrorCodes::ARGUMENT_OUT_OF_BOUND /// For Decimals
|| code == ErrorCodes::INCORRECT_DATA; /// For some ReadHelpers
}
@ -127,6 +131,10 @@ Chunk IRowInputFormat::generate()
{
verbose_diagnostic = getDiagnosticInfo();
}
catch (const Exception & exception)
{
verbose_diagnostic = "Cannot get verbose diagnostic: " + exception.message();
}
catch (...)
{
/// Error while trying to obtain verbose diagnostic. Ok to ignore.

View File

@ -273,6 +273,7 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
return false;
}
skipWhitespacesAndTabs(in);
if (column_indexes_for_input_fields[file_column].has_value())
{
const auto & header = getPort().getHeader();
@ -289,6 +290,7 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
return false;
}
skipWhitespacesAndTabs(in);
/// Delimiters
if (file_column + 1 == column_indexes_for_input_fields.size())
@ -351,12 +353,8 @@ void CSVRowInputFormat::syncAfterError()
skipToNextLineOrEOF(in);
}
void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
void CSVRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
{
skipWhitespacesAndTabs(in);
prev_pos = in.position();
if (column_indexes_for_input_fields[file_column])
{
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
@ -367,9 +365,6 @@ void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn &
String tmp;
readCSVString(tmp, in, format_settings.csv);
}
curr_pos = in.position();
skipWhitespacesAndTabs(in);
}
bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column)

View File

@ -55,11 +55,10 @@ private:
void addInputColumn(const String & column_name);
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override
{
return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter;
return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter && *pos != ' ' && *pos != '\t';
}
bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column);

View File

@ -318,10 +318,8 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns &
return true;
}
void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
void TabSeparatedRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
{
prev_pos = in.position();
if (column_indexes_for_input_fields[file_column])
{
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
@ -332,7 +330,6 @@ void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, I
NullSink null_sink;
readEscapedStringInto(null_sink, in);
}
curr_pos = in.position();
}
void TabSeparatedRowInputFormat::syncAfterError()

View File

@ -50,8 +50,7 @@ private:
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension);
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; }
};

View File

@ -410,15 +410,12 @@ void TemplateRowInputFormat::writeErrorStringForWrongDelimiter(WriteBuffer & out
out << '\n';
}
void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
{
prev_pos = buf.position();
if (row_format.format_idx_to_column_idx[file_column])
deserializeField(type, column, file_column);
else
skipField(row_format.formats[file_column]);
curr_pos = buf.position();
}
bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position)

View File

@ -42,8 +42,7 @@ private:
[[noreturn]] void throwUnexpectedEof();
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, ReadBuffer::Position & prev_pos,
ReadBuffer::Position & curr_pos) override;
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override;
void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim);

View File

@ -20,10 +20,10 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
extern const int SYNTAX_ERROR;
extern const int TYPE_MISMATCH;
extern const int SUPPORT_IS_DISABLED;
extern const int ARGUMENT_OUT_OF_BOUND;
}
@ -167,7 +167,9 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx)
}
catch (const Exception & e)
{
if (!isParseError(e.code()) && e.code() != ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED)
/// Do not consider decimal overflow as parse error to avoid attempts to parse it as expression with float literal
bool decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND;
if (!isParseError(e.code()) || decimal_overflow)
throw;
if (rollback_on_exception)
column.popBack(1);
@ -226,7 +228,8 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx
}
catch (const Exception & e)
{
if (!isParseError(e.code()))
bool decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND;
if (!isParseError(e.code()) || decimal_overflow)
throw;
}
if (ok)

View File

@ -37,8 +37,8 @@ void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo()
String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo()
{
if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed.
return {};
if (in.eof())
return "Buffer has gone, cannot extract information about what has been parsed.";
WriteBufferFromOwnString out;
@ -102,17 +102,17 @@ bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co
<< "type: " << alignedName(type->getName(), max_length_of_data_type_name);
auto * prev_position = in.position();
auto * curr_position = in.position();
std::exception_ptr exception;
try
{
tryDeserializeFiled(type, column, file_column, prev_position, curr_position);
tryDeserializeField(type, column, file_column);
}
catch (...)
{
exception = std::current_exception();
}
auto * curr_position = in.position();
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);

View File

@ -24,8 +24,7 @@ protected:
WriteBuffer & out, size_t file_column);
virtual bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) = 0;
virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) = 0;
virtual void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) = 0;
virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0;
/// For convenient diagnostics in case of an error.

View File

@ -0,0 +1,29 @@
CSV
Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
ERROR: garbage after DateTime: "7, <DOUBLE QUOTE>Hello<DOUBLE QUOTE>"
ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.
ERROR: There is no line feed. "1" found instead.
ERROR: garbage after Decimal(18, 10): "Hello<LINE FEED>"
Column 0, name: t, type: DateTime, ERROR: text "<LINE FEED>" is not like DateTime
CustomSeparatedIgnoreSpaces
Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
ERROR: There is no delimiter before field 1: expected ",", got "7, <DOUBLE QUOTE>Hello<DOUBLE QUOTE>,"
Column 0, name: t, type: DateTime, ERROR: text ",1<LINE FEED>" is not like DateTime
Column 0, name: t, type: DateTime, ERROR: text "Hello<LINE FEED>" is not like DateTime
OK
TSV
Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
ERROR: garbage after DateTime: "7<TAB>Hello<TAB>12"
ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.
ERROR: Tab found where line feed is expected. It's like your file has more columns than expected.
ERROR: garbage after Decimal(18, 10): "Hello<LINE FEED>"
Column 0, name: t, type: DateTime, ERROR: text "<LINE FEED>" is not like DateTime
CustomSeparated
Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
ERROR: There is no delimiter before field 1: expected "<TAB>", got "7<TAB>Hello<TAB>123"
ERROR: There is no delimiter after last field: expected "<LINE FEED>", got "<TAB>1<LINE FEED>"
ERROR: There is no delimiter after last field: expected "<LINE FEED>", got "Hello<LINE FEED>"
Column 0, name: t, type: DateTime, ERROR: text "<LINE FEED>" is not like DateTime

View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CSV)
echo '2020-04-21 12:34:56, "Hello", 12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "CSV"
echo '2020-04-21 12:34:56, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo '2020-04-21 12:34:567, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
#echo '2020-04-21, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" # DateTime parsing is unsafe, it produces unexpected result ("Hello" is parsed as time)
echo '2020-04-21 12:34:56, "Hello", 12345678,1' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo '2020-04-21 12:34:56,,123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56, "Hello", 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CustomSeparatedIgnoreSpaces --format_custom_escaping_rule CSV --format_custom_field_delimiter ',' --format_custom_row_after_delimiter "")
echo '2020-04-21 12:34:56, "Hello", 12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo -e "\nCustomSeparatedIgnoreSpaces"
echo '2020-04-21 12:34:56, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo '2020-04-21 12:34:567, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo '2020-04-21 12:34:56, "Hello", 12345678,1' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo '2020-04-21 12:34:56,,123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56, "Hello", 12345678\n\n\n\n ' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "OK"
PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format TSV)
echo -e '2020-04-21 12:34:56\tHello\t12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo -e "\nTSV"
echo -e '2020-04-21 12:34:56\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CustomSeparated)
echo -e '2020-04-21 12:34:56\tHello\t12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo -e "\nCustomSeparated"
echo -e '2020-04-21 12:34:56\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"