diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp
index 5576da56dbf..f97df25aba7 100644
--- a/src/Formats/ReadSchemaUtils.cpp
+++ b/src/Formats/ReadSchemaUtils.cpp
@@ -226,10 +226,12 @@ try
             continue;
         }

-        SchemaReaderPtr schema_reader;
+        std::unique_ptr<PeekableReadBuffer> peekable_buf; /// Can be used in format detection. Should be destroyed after schema reader.

         if (format_name)
         {
+            SchemaReaderPtr schema_reader;
+
             try
             {
                 schema_reader = FormatFactory::instance().getSchemaReader(*format_name, *iterator_data.buf, context, format_settings);
@@ -296,7 +298,6 @@ try
             /// to high memory usage as it will save all the read data from the beginning of the file,
             /// especially it will be noticeable for formats like Parquet/ORC/Arrow that do seeks to the
             /// end of file.
-            std::unique_ptr<PeekableReadBuffer> peekable_buf;
             bool support_buf_recreation = read_buffer_iterator.supportsLastReadBufferRecreation();
             if (!support_buf_recreation)
             {
@@ -310,7 +311,7 @@ try
         {
             try
             {
-                schema_reader = FormatFactory::instance().getSchemaReader(format_to_detect, support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings);
+                SchemaReaderPtr schema_reader = FormatFactory::instance().getSchemaReader(format_to_detect, support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings);
                 schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read);
                 names_and_types = schema_reader->readSchema();
                 if (names_and_types.empty())
@@ -355,7 +356,7 @@ try
         {
             try
             {
-                schema_reader = FormatFactory::instance().getSchemaReader(
+                SchemaReaderPtr schema_reader = FormatFactory::instance().getSchemaReader(
                     formats_set_to_detect[i], support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings);
                 schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read);
                 auto tmp_names_and_types = schema_reader->readSchema();
diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp
index a846e9fd9ef..59eba6505f3 100644
--- a/src/Storages/HDFS/StorageHDFS.cpp
+++ b/src/Storages/HDFS/StorageHDFS.cpp
@@ -1147,7 +1147,7 @@ void registerStorageHDFS(StorageFactory & factory)
         }

         if (format_name == "auto")
-            format_name = FormatFactory::instance().getFormatFromFileName(url);
+            format_name = FormatFactory::instance().tryGetFormatFromFileName(url).value_or("auto");

         String compression_method;
         if (engine_args.size() == 3)
diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp
index 35072dc5cae..c55725ce940 100644
--- a/src/Storages/StorageAzureBlob.cpp
+++ b/src/Storages/StorageAzureBlob.cpp
@@ -144,7 +144,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine
         configuration.blobs_paths = {configuration.blob_path};

         if (configuration.format == "auto")
-            configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path);
+            configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto");

         return configuration;
     }
@@ -237,7 +237,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine
         configuration.blobs_paths = {configuration.blob_path};

         if (configuration.format == "auto")
-            configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path);
+            configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto");

         return configuration;
     }
@@ -1316,10 +1316,28 @@ namespace
     Data next() override
     {
         /// For default mode check cached columns for currently read keys on first iteration.
-        if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT)
+        if (first)
         {
-            if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end()))
-                return {nullptr, cached_columns, format};
+            /// If format is unknown we iterate through all currently read keys on first iteration and
+            /// try to determine format by file name.
+            if (!format)
+            {
+                for (const auto & key : read_keys)
+                {
+                    if (auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(key.relative_path))
+                    {
+                        format = format_from_path;
+                        break;
+                    }
+                }
+            }
+
+            /// For default mode check cached columns for currently read keys on first iteration.
+            if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT)
+            {
+                if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end()))
+                    return {nullptr, cached_columns, format};
+            }
         }

         current_path_with_metadata = file_iterator->next();
@@ -1345,15 +1363,33 @@ namespace

         first = false;

-        /// AzureBlobStorage file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default.
-        if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size)
+        /// AzureBlobStorage file iterator could get new keys after new iteration.
+        if (read_keys.size() > prev_read_keys_size)
         {
-            auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end());
+            /// If format is unknown we can try to determine it by new file names.
+            if (!format)
+            {
+                for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it)
+                {
+                    if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it).relative_path))
+                    {
+                        format = format_from_file_name;
+                        break;
+                    }
+                }
+            }
+            /// Check new files in schema cache if schema inference mode is default.
+            if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT)
+            {
+                auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end());
+                if (columns_from_cache)
+                    return {nullptr, columns_from_cache, format};
+            }
+
             prev_read_keys_size = read_keys.size();
-            if (columns_from_cache)
-                return {nullptr, columns_from_cache, format};
         }
-        else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION)
+
+        if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION)
         {
             RelativePathsWithMetadata paths = {current_path_with_metadata};
             if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end()))
@@ -1520,7 +1556,7 @@ ColumnsDescription StorageAzureBlob::getTableStructureFromData(
     const std::optional<FormatSettings> & format_settings,
     const DB::ContextPtr & ctx)
 {
-    return getTableStructureAndFormatFromDataImpl(std::nullopt, object_storage, configuration, format_settings, ctx).first;
+    return getTableStructureAndFormatFromDataImpl(configuration.format, object_storage, configuration, format_settings, ctx).first;
 }

 SchemaCache & StorageAzureBlob::getSchemaCache(const ContextPtr & ctx)
diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp
index 8e5b6040a63..f9c7400edfb 100644
--- a/src/Storages/StorageS3.cpp
+++ b/src/Storages/StorageS3.cpp
@@ -1702,30 +1702,33 @@ namespace
                 return {nullptr, std::nullopt, format};
             }

-            /// S3 file iterator could get new keys after new iteration, if format is unknown we can try to determine it by new file names.
-            if (!format && read_keys.size() > prev_read_keys_size)
+            /// S3 file iterator could get new keys after new iteration
+            if (read_keys.size() > prev_read_keys_size)
             {
-                for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it)
+                /// If format is unknown we can try to determine it by new file names.
+                if (!format)
                 {
-                    if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->key))
+                    for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it)
                     {
-                        format = format_from_file_name;
-                        break;
+                        if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->key))
+                        {
+                            format = format_from_file_name;
+                            break;
+                        }
                     }
                 }
-            }

-            /// S3 file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default.
-            if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size)
-            {
-                auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end());
+                /// Check new files in schema cache if schema inference mode is default.
+                if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT)
+                {
+                    auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end());
+                    if (columns_from_cache)
+                        return {nullptr, columns_from_cache, format};
+                }
+
                 prev_read_keys_size = read_keys.size();
-                if (columns_from_cache)
-                    return {nullptr, columns_from_cache, format};
             }

-            prev_read_keys_size = read_keys.size();
-
             if (getContext()->getSettingsRef().s3_skip_empty_files && current_key_with_info->info && current_key_with_info->info->size == 0)
                 continue;
diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp
index b9e0af53b7b..8a537e154db 100644
--- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp
+++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp
@@ -58,7 +58,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const
         configuration.blobs_paths = {configuration.blob_path};

         if (configuration.format == "auto")
-            configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path);
+            configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto");
     }
     else
     {
diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py
index 8dee15f4d94..165dfb212b7 100644
--- a/tests/integration/test_storage_hdfs/test.py
+++ b/tests/integration/test_storage_hdfs/test.py
@@ -600,7 +600,7 @@ def test_schema_inference_with_globs(started_cluster):
     )

     assert (
-        "Cannot extract table structure from JSONCompactEachRow format file" in result
+        "CANNOT_EXTRACT_TABLE_STRUCTURE" in result
     )


@@ -1044,7 +1044,7 @@ def test_union_schema_inference_mode(started_cluster):
     error = node.query_and_get_error(
         "desc hdfs('hdfs://hdfs1:9000/test_union_schema_inference*.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
     )
-    assert "Cannot extract table structure" in error
+    assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error


 def test_format_detection(started_cluster):
diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py
index 365ade7da65..0b5e9462860 100644
--- a/tests/integration/test_storage_s3/test.py
+++ b/tests/integration/test_storage_s3/test.py
@@ -1380,7 +1380,7 @@ def test_schema_inference_from_globs(started_cluster):
     )

     assert (
-        "Cannot extract table structure from JSONCompactEachRow format file" in result
+        "CANNOT_EXTRACT_TABLE_STRUCTURE" in result
     )

     url_filename = "test{0,1,2,3}.jsoncompacteachrow"
@@ -1390,7 +1390,7 @@ def test_schema_inference_from_globs(started_cluster):
     )

     assert (
-        "Cannot extract table structure from JSONCompactEachRow format file" in result
+        "CANNOT_EXTRACT_TABLE_STRUCTURE" in result
     )


@@ -2193,7 +2193,7 @@ def test_union_schema_inference_mode(started_cluster):
     error = instance.query_and_get_error(
         f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3,4}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
     )
-    assert "Cannot extract table structure" in error
+    assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error


 def test_s3_format_detection(started_cluster):
diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh
index 71ccee6f5f4..d62f928e947 100755
--- a/tests/queries/0_stateless/02725_database_hdfs.sh
+++ b/tests/queries/0_stateless/02725_database_hdfs.sh
@@ -58,7 +58,7 @@ SELECT * FROM \"abacaba/file.tsv\"
 """ 2>&1 | tr '\n' ' ' | grep -oF "CANNOT_EXTRACT_TABLE_STRUCTURE"

 ${CLICKHOUSE_CLIENT} -q "SELECT * FROM test_hdfs_4.\`http://localhost:11111/test/a.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||:
-${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "HDFS_ERROR" > /dev/null && echo "OK" || echo 'FAIL' ||:
+${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||:
 ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||:

 ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: