From 454eb1af7ed69d00617e6bfa3daece508fcb565b Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 22 Jul 2024 17:15:05 +0000 Subject: [PATCH 1/2] Fix invalid format detection in schema inference tmp --- src/Storages/ObjectStorage/ReadBufferIterator.cpp | 4 +++- src/Storages/StorageFile.cpp | 8 ++++++-- src/Storages/StorageURL.cpp | 4 +++- .../03023_invalid_format_detection.reference | 1 + .../0_stateless/03023_invalid_format_detection.sh | 10 ++++++++++ 5 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/03023_invalid_format_detection.reference create mode 100755 tests/queries/0_stateless/03023_invalid_format_detection.sh diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 78cdc442f64..3d428073e88 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -162,7 +162,9 @@ ReadBufferIterator::Data ReadBufferIterator::next() { for (const auto & object_info : read_keys) { - if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(object_info->getFileName())) + auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(object_info->getFileName()); + /// Use this format only if we have a schema reader for it. + if (format_from_file_name && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_file_name)) { format = format_from_file_name; break; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 7f39ff615f0..b4c03fa86c6 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -416,7 +416,9 @@ namespace { for (const auto & path : paths) { - if (auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(path)) + auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(path); + /// Use this format only if we have a schema reader for it. + if (format_from_path && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_path)) { format = format_from_path; break; @@ -705,7 +707,9 @@ namespace /// If format is unknown we can try to determine it by the file name. if (!format) { - if (auto format_from_file = FormatFactory::instance().tryGetFormatFromFileName(*filename)) + auto format_from_file = FormatFactory::instance().tryGetFormatFromFileName(*filename); + /// Use this format only if we have a schema reader for it. + if (format_from_file && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_file)) format = format_from_file; } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 895da028fc2..4d82c2bfadb 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -735,7 +735,9 @@ namespace { for (const auto & url : options) { - if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(url)) + auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(url); + /// Use this format only if we have a schema reader for it. + if (format_from_file_name && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_file_name)) { format = format_from_file_name; break; diff --git a/tests/queries/0_stateless/03023_invalid_format_detection.reference b/tests/queries/0_stateless/03023_invalid_format_detection.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/03023_invalid_format_detection.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/03023_invalid_format_detection.sh b/tests/queries/0_stateless/03023_invalid_format_detection.sh new file mode 100755 index 00000000000..984a42aa218 --- /dev/null +++ b/tests/queries/0_stateless/03023_invalid_format_detection.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +touch $CLICKHOUSE_TEST_UNIQUE_NAME.xml +$CLICKHOUSE_LOCAL -q "select * from file('$CLICKHOUSE_TEST_UNIQUE_NAME.*')" 2>&1 | grep -c "CANNOT_DETECT_FORMAT" +rm $CLICKHOUSE_TEST_UNIQUE_NAME.xml + From ef4cb8b28324f1599c082b70e9c9c516c6c519d4 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 22 Jul 2024 17:18:20 +0000 Subject: [PATCH 2/2] Add missing check --- src/Storages/ObjectStorage/ReadBufferIterator.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 3d428073e88..df78f128c80 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -222,7 +223,9 @@ ReadBufferIterator::Data ReadBufferIterator::next() { for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) { - if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->getFileName())) + auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->getFileName()); + /// Use this format only if we have a schema reader for it. + if (format_from_file_name && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_file_name)) { format = format_from_file_name; break;