mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 07:01:59 +00:00
better diagnostic info in input formats
This commit is contained in:
parent
388bcffec9
commit
53dcce55e9
@ -33,17 +33,14 @@ inline bool readDigits(ReadBuffer & buf, T & x, unsigned int & digits, int & exp
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!buf.eof())
|
||||
switch (*buf.position())
|
||||
{
|
||||
switch (*buf.position())
|
||||
{
|
||||
case '-':
|
||||
sign = -1;
|
||||
[[fallthrough]];
|
||||
case '+':
|
||||
++buf.position();
|
||||
break;
|
||||
}
|
||||
case '-':
|
||||
sign = -1;
|
||||
[[fallthrough]];
|
||||
case '+':
|
||||
++buf.position();
|
||||
break;
|
||||
}
|
||||
|
||||
bool stop = false;
|
||||
|
@ -18,6 +18,8 @@ namespace ErrorCodes
|
||||
extern const int CANNOT_PARSE_UUID;
|
||||
extern const int TOO_LARGE_STRING_SIZE;
|
||||
extern const int INCORRECT_NUMBER_OF_COLUMNS;
|
||||
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||
extern const int INCORRECT_DATA;
|
||||
}
|
||||
|
||||
|
||||
@ -30,7 +32,9 @@ bool isParseError(int code)
|
||||
|| code == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT
|
||||
|| code == ErrorCodes::CANNOT_PARSE_NUMBER
|
||||
|| code == ErrorCodes::CANNOT_PARSE_UUID
|
||||
|| code == ErrorCodes::TOO_LARGE_STRING_SIZE;
|
||||
|| code == ErrorCodes::TOO_LARGE_STRING_SIZE
|
||||
|| code == ErrorCodes::ARGUMENT_OUT_OF_BOUND /// For Decimals
|
||||
|| code == ErrorCodes::INCORRECT_DATA; /// For some ReadHelpers
|
||||
}
|
||||
|
||||
|
||||
@ -127,6 +131,10 @@ Chunk IRowInputFormat::generate()
|
||||
{
|
||||
verbose_diagnostic = getDiagnosticInfo();
|
||||
}
|
||||
catch (const Exception & exception)
|
||||
{
|
||||
verbose_diagnostic = "Cannot get verbose diagnostic: " + exception.message();
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
/// Error while trying to obtain verbose diagnostic. Ok to ignore.
|
||||
|
@ -273,6 +273,7 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
||||
return false;
|
||||
}
|
||||
|
||||
skipWhitespacesAndTabs(in);
|
||||
if (column_indexes_for_input_fields[file_column].has_value())
|
||||
{
|
||||
const auto & header = getPort().getHeader();
|
||||
@ -289,6 +290,7 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
||||
if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
|
||||
return false;
|
||||
}
|
||||
skipWhitespacesAndTabs(in);
|
||||
|
||||
/// Delimiters
|
||||
if (file_column + 1 == column_indexes_for_input_fields.size())
|
||||
@ -351,12 +353,8 @@ void CSVRowInputFormat::syncAfterError()
|
||||
skipToNextLineOrEOF(in);
|
||||
}
|
||||
|
||||
void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
|
||||
void CSVRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
|
||||
{
|
||||
skipWhitespacesAndTabs(in);
|
||||
prev_pos = in.position();
|
||||
|
||||
if (column_indexes_for_input_fields[file_column])
|
||||
{
|
||||
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
|
||||
@ -367,9 +365,6 @@ void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn &
|
||||
String tmp;
|
||||
readCSVString(tmp, in, format_settings.csv);
|
||||
}
|
||||
|
||||
curr_pos = in.position();
|
||||
skipWhitespacesAndTabs(in);
|
||||
}
|
||||
|
||||
bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column)
|
||||
|
@ -55,11 +55,10 @@ private:
|
||||
void addInputColumn(const String & column_name);
|
||||
|
||||
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
|
||||
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
|
||||
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
|
||||
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override
|
||||
{
|
||||
return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter;
|
||||
return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter && *pos != ' ' && *pos != '\t';
|
||||
}
|
||||
|
||||
bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column);
|
||||
|
@ -318,10 +318,8 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns &
|
||||
return true;
|
||||
}
|
||||
|
||||
void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
|
||||
void TabSeparatedRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
|
||||
{
|
||||
prev_pos = in.position();
|
||||
if (column_indexes_for_input_fields[file_column])
|
||||
{
|
||||
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
|
||||
@ -332,7 +330,6 @@ void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, I
|
||||
NullSink null_sink;
|
||||
readEscapedStringInto(null_sink, in);
|
||||
}
|
||||
curr_pos = in.position();
|
||||
}
|
||||
|
||||
void TabSeparatedRowInputFormat::syncAfterError()
|
||||
|
@ -50,8 +50,7 @@ private:
|
||||
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension);
|
||||
|
||||
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
|
||||
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
|
||||
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
|
||||
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; }
|
||||
};
|
||||
|
||||
|
@ -410,15 +410,12 @@ void TemplateRowInputFormat::writeErrorStringForWrongDelimiter(WriteBuffer & out
|
||||
out << '\n';
|
||||
}
|
||||
|
||||
void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
|
||||
void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
|
||||
{
|
||||
prev_pos = buf.position();
|
||||
if (row_format.format_idx_to_column_idx[file_column])
|
||||
deserializeField(type, column, file_column);
|
||||
else
|
||||
skipField(row_format.formats[file_column]);
|
||||
curr_pos = buf.position();
|
||||
}
|
||||
|
||||
bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position)
|
||||
|
@ -42,8 +42,7 @@ private:
|
||||
[[noreturn]] void throwUnexpectedEof();
|
||||
|
||||
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
|
||||
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, ReadBuffer::Position & prev_pos,
|
||||
ReadBuffer::Position & curr_pos) override;
|
||||
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
|
||||
bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override;
|
||||
void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim);
|
||||
|
||||
|
@ -20,10 +20,10 @@ namespace DB
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
|
||||
extern const int SYNTAX_ERROR;
|
||||
extern const int TYPE_MISMATCH;
|
||||
extern const int SUPPORT_IS_DISABLED;
|
||||
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||
}
|
||||
|
||||
|
||||
@ -167,7 +167,9 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx)
|
||||
}
|
||||
catch (const Exception & e)
|
||||
{
|
||||
if (!isParseError(e.code()) && e.code() != ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED)
|
||||
/// Do not consider decimal overflow a parse error, to avoid attempts to parse it as an expression with a float literal.
|
||||
bool decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND;
|
||||
if (!isParseError(e.code()) || decimal_overflow)
|
||||
throw;
|
||||
if (rollback_on_exception)
|
||||
column.popBack(1);
|
||||
@ -226,7 +228,8 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx
|
||||
}
|
||||
catch (const Exception & e)
|
||||
{
|
||||
if (!isParseError(e.code()))
|
||||
bool decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND;
|
||||
if (!isParseError(e.code()) || decimal_overflow)
|
||||
throw;
|
||||
}
|
||||
if (ok)
|
||||
|
@ -37,8 +37,8 @@ void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo()
|
||||
|
||||
String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo()
|
||||
{
|
||||
if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed.
|
||||
return {};
|
||||
if (in.eof())
|
||||
return "Buffer has gone, cannot extract information about what has been parsed.";
|
||||
|
||||
WriteBufferFromOwnString out;
|
||||
|
||||
@ -102,17 +102,17 @@ bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co
|
||||
<< "type: " << alignedName(type->getName(), max_length_of_data_type_name);
|
||||
|
||||
auto * prev_position = in.position();
|
||||
auto * curr_position = in.position();
|
||||
std::exception_ptr exception;
|
||||
|
||||
try
|
||||
{
|
||||
tryDeserializeFiled(type, column, file_column, prev_position, curr_position);
|
||||
tryDeserializeField(type, column, file_column);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
exception = std::current_exception();
|
||||
}
|
||||
auto * curr_position = in.position();
|
||||
|
||||
if (curr_position < prev_position)
|
||||
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
|
||||
|
@ -24,8 +24,7 @@ protected:
|
||||
WriteBuffer & out, size_t file_column);
|
||||
|
||||
virtual bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) = 0;
|
||||
virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) = 0;
|
||||
virtual void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) = 0;
|
||||
virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0;
|
||||
|
||||
/// For convenient diagnostics in case of an error.
|
||||
|
@ -0,0 +1,29 @@
|
||||
CSV
|
||||
Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
|
||||
ERROR: garbage after DateTime: "7, <DOUBLE QUOTE>Hello<DOUBLE QUOTE>"
|
||||
ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.
|
||||
ERROR: There is no line feed. "1" found instead.
|
||||
ERROR: garbage after Decimal(18, 10): "Hello<LINE FEED>"
|
||||
Column 0, name: t, type: DateTime, ERROR: text "<LINE FEED>" is not like DateTime
|
||||
|
||||
CustomSeparatedIgnoreSpaces
|
||||
Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
|
||||
ERROR: There is no delimiter before field 1: expected ",", got "7, <DOUBLE QUOTE>Hello<DOUBLE QUOTE>,"
|
||||
Column 0, name: t, type: DateTime, ERROR: text ",1<LINE FEED>" is not like DateTime
|
||||
Column 0, name: t, type: DateTime, ERROR: text "Hello<LINE FEED>" is not like DateTime
|
||||
OK
|
||||
|
||||
TSV
|
||||
Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
|
||||
ERROR: garbage after DateTime: "7<TAB>Hello<TAB>12"
|
||||
ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.
|
||||
ERROR: Tab found where line feed is expected. It's like your file has more columns than expected.
|
||||
ERROR: garbage after Decimal(18, 10): "Hello<LINE FEED>"
|
||||
Column 0, name: t, type: DateTime, ERROR: text "<LINE FEED>" is not like DateTime
|
||||
|
||||
CustomSeparated
|
||||
Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
|
||||
ERROR: There is no delimiter before field 1: expected "<TAB>", got "7<TAB>Hello<TAB>123"
|
||||
ERROR: There is no delimiter after last field: expected "<LINE FEED>", got "<TAB>1<LINE FEED>"
|
||||
ERROR: There is no delimiter after last field: expected "<LINE FEED>", got "Hello<LINE FEED>"
|
||||
Column 0, name: t, type: DateTime, ERROR: text "<LINE FEED>" is not like DateTime
|
37
tests/queries/0_stateless/01195_formats_diagnostic_info.sh
Executable file
37
tests/queries/0_stateless/01195_formats_diagnostic_info.sh
Executable file
@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
. $CURDIR/../shell_config.sh
|
||||
|
||||
PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CSV)
|
||||
echo '2020-04-21 12:34:56, "Hello", 12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "CSV"
|
||||
echo '2020-04-21 12:34:56, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo '2020-04-21 12:34:567, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
#echo '2020-04-21, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR" # DateTime parsing is unsafe: it produces an unexpected result ("Hello" is parsed as time)
|
||||
echo '2020-04-21 12:34:56, "Hello", 12345678,1' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo '2020-04-21 12:34:56,,123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:56, "Hello", 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
|
||||
PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CustomSeparatedIgnoreSpaces --format_custom_escaping_rule CSV --format_custom_field_delimiter ',' --format_custom_row_after_delimiter "")
|
||||
echo '2020-04-21 12:34:56, "Hello", 12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo -e "\nCustomSeparatedIgnoreSpaces"
|
||||
echo '2020-04-21 12:34:56, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo '2020-04-21 12:34:567, "Hello", 123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo '2020-04-21 12:34:56, "Hello", 12345678,1' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo '2020-04-21 12:34:56,,123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:56, "Hello", 12345678\n\n\n\n ' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "OK"
|
||||
|
||||
PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format TSV)
|
||||
echo -e '2020-04-21 12:34:56\tHello\t12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo -e "\nTSV"
|
||||
echo -e '2020-04-21 12:34:56\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
|
||||
PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format CustomSeparated)
|
||||
echo -e '2020-04-21 12:34:56\tHello\t12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo -e "\nCustomSeparated"
|
||||
echo -e '2020-04-21 12:34:56\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
||||
echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
|
Loading…
Reference in New Issue
Block a user