From 5dd99db3692f9e2fb7264275c43385e274467674 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 8 Aug 2023 03:55:06 +0200 Subject: [PATCH 1/2] Add diagnostic info about file name during schema inference --- src/Formats/ReadSchemaUtils.cpp | 16 ++++++++++++---- src/Processors/Formats/IRowInputFormat.cpp | 16 +++++++++++++++- ...le_diagnostics_while_reading_header.reference | 2 ++ ...2836_file_diagnostics_while_reading_header.sh | 11 +++++++++++ 4 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/02836_file_diagnostics_while_reading_header.reference create mode 100755 tests/queries/0_stateless/02836_file_diagnostics_while_reading_header.sh diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index cbfa05bfba6..d183442ba44 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -1,13 +1,11 @@ -#include -#include #include -#include -#include #include #include #include #include #include +#include + namespace DB { @@ -53,6 +51,7 @@ ColumnsDescription readSchemaFromFormat( bool retry, ContextPtr & context, std::unique_ptr & buf) +try { NamesAndTypesList names_and_types; if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name)) @@ -209,12 +208,21 @@ ColumnsDescription readSchemaFromFormat( ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference. You must specify the structure manually", format_name); + /// Some formats like CSVWithNames can contain empty column names. We don't support empty column names and further processing can fail with an exception. Let's just remove columns with empty names from the structure. names_and_types.erase( std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }), names_and_types.end()); return ColumnsDescription(names_and_types); } +catch (Exception & e) +{ + auto file_name = getFileNameFromReadBuffer(*buf); + if (!file_name.empty()) + e.addMessage(fmt::format("(in file/uri {})", file_name)); + throw; +} + ColumnsDescription readSchemaFromFormat( const String & format_name, diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index 8bcf293033b..7395d9b916a 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -86,7 +86,21 @@ void IRowInputFormat::logError() Chunk IRowInputFormat::generate() { if (total_rows == 0) - readPrefix(); + { + try + { + readPrefix(); + } + catch (Exception & e) + { + auto file_name = getFileNameFromReadBuffer(getReadBuffer()); + if (!file_name.empty()) + e.addMessage(fmt::format("(in file/uri {})", file_name)); + + e.addMessage("(while reading header)"); + throw; + } + } const Block & header = getPort().getHeader(); diff --git a/tests/queries/0_stateless/02836_file_diagnostics_while_reading_header.reference b/tests/queries/0_stateless/02836_file_diagnostics_while_reading_header.reference new file mode 100644 index 00000000000..6829f972684 --- /dev/null +++ b/tests/queries/0_stateless/02836_file_diagnostics_while_reading_header.reference @@ -0,0 +1,2 @@ +in file/uri +test.csv diff --git a/tests/queries/0_stateless/02836_file_diagnostics_while_reading_header.sh b/tests/queries/0_stateless/02836_file_diagnostics_while_reading_header.sh new file mode 100755 index 00000000000..d1b5ffa2af8 --- /dev/null +++ b/tests/queries/0_stateless/02836_file_diagnostics_while_reading_header.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +FILENAME="${CLICKHOUSE_TMP}/test.csv" + +printf 'Bad\rHeader\n123\n' > "${FILENAME}" +${CLICKHOUSE_LOCAL} --query "SELECT * FROM file('${CLICKHOUSE_TMP}/t*e*s*t.csv')" 2>&1 | grep -o -P 'in file/uri|test\.csv' +rm "${FILENAME}" From 29221188baaa6b601824f47319ad888462dbf296 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 9 Aug 2023 04:07:31 +0200 Subject: [PATCH 2/2] Fix error --- src/Formats/ReadSchemaUtils.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index d183442ba44..b40fb20dadf 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -217,6 +217,8 @@ try } catch (Exception & e) { + if (!buf) + throw; auto file_name = getFileNameFromReadBuffer(*buf); if (!file_name.empty()) e.addMessage(fmt::format("(in file/uri {})", file_name));