Merge pull request #53135 from ClickHouse/file_diagnostics_while_reading_header

Add diagnostic info about file name during schema inference
This commit is contained in:
Alexey Milovidov 2023-08-10 21:56:12 +03:00 committed by GitHub
commit fd7b92e90a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 5 deletions

View File

@ -1,13 +1,11 @@
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>
#include <Formats/ReadSchemaUtils.h>
#include <Interpreters/Context.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Storages/IStorage.h>
#include <Common/assert_cast.h>
#include <IO/WithFileName.h>
namespace DB
{
@ -53,6 +51,7 @@ ColumnsDescription readSchemaFromFormat(
bool retry,
ContextPtr & context,
std::unique_ptr<ReadBuffer> & buf)
try
{
NamesAndTypesList names_and_types;
if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name))
@ -209,12 +208,23 @@ ColumnsDescription readSchemaFromFormat(
ErrorCodes::BAD_ARGUMENTS,
"{} file format doesn't support schema inference. You must specify the structure manually",
format_name);
/// Some formats like CSVWithNames can contain empty column names. We don't support empty column names and further processing can fail with an exception. Let's just remove columns with empty names from the structure.
names_and_types.erase(
std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }),
names_and_types.end());
return ColumnsDescription(names_and_types);
}
catch (Exception & e)
{
if (!buf)
throw;
auto file_name = getFileNameFromReadBuffer(*buf);
if (!file_name.empty())
e.addMessage(fmt::format("(in file/uri {})", file_name));
throw;
}
ColumnsDescription readSchemaFromFormat(
const String & format_name,

View File

@ -86,7 +86,21 @@ void IRowInputFormat::logError()
Chunk IRowInputFormat::generate()
{
if (total_rows == 0)
readPrefix();
{
try
{
readPrefix();
}
catch (Exception & e)
{
auto file_name = getFileNameFromReadBuffer(getReadBuffer());
if (!file_name.empty())
e.addMessage(fmt::format("(in file/uri {})", file_name));
e.addMessage("(while reading header)");
throw;
}
}
const Block & header = getPort().getHeader();

View File

@ -0,0 +1,2 @@
in file/uri
test.csv

View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash

# Checks that an error raised while reading a file selected through a glob
# mentions the file it came from: the combined output of clickhouse-local is
# filtered down to the "in file/uri" marker and the "test.csv" file name.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
source "$CURDIR"/../shell_config.sh

csv_path="${CLICKHOUSE_TMP}/test.csv"

# The first line of the file contains a carriage return in the middle of it.
printf 'Bad\rHeader\n123\n' > "${csv_path}"

# The file is addressed through a glob pattern rather than by its exact name.
query="SELECT * FROM file('${CLICKHOUSE_TMP}/t*e*s*t.csv')"

# stderr is merged into stdout so that grep sees the error text as well;
# only the matching fragments are printed, one per line.
${CLICKHOUSE_LOCAL} --query "${query}" 2>&1 \
    | grep -oP 'in file/uri|test\.csv'

# Remove the temporary input file.
rm "${csv_path}"