Merge pull request #71947 from ClickHouse/fix_weird_problem

Fix weird case when `s3`/`s3Cluster` return incomplete result or exception
This commit is contained in:
Kseniia Sumarokova 2024-11-20 16:37:58 +00:00 committed by GitHub
commit c6a10151d9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 49 additions and 5 deletions

View File

@ -258,7 +258,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32)
- [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default. - [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default.
- [s3_create_new_file_on_insert](/docs/en/operations/settings/settings.md#s3_create_new_file_on_insert) - allows to create a new file on each insert if format has suffix. Disabled by default. - [s3_create_new_file_on_insert](/docs/en/operations/settings/settings.md#s3_create_new_file_on_insert) - allows to create a new file on each insert if format has suffix. Disabled by default.
- [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Disabled by default. - [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Enabled by default.
## S3-related Settings {#settings} ## S3-related Settings {#settings}

View File

@ -317,7 +317,7 @@ SELECT * from s3('s3://data/path/date=*/country=*/code=*/*.parquet') where _date
- [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default. - [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default.
- [s3_create_new_file_on_insert](/docs/en/operations/settings/settings.md#s3_create_new_file_on_insert) - allows to create a new file on each insert if format has suffix. Disabled by default. - [s3_create_new_file_on_insert](/docs/en/operations/settings/settings.md#s3_create_new_file_on_insert) - allows to create a new file on each insert if format has suffix. Disabled by default.
- [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Disabled by default. - [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Enabled by default.
**See Also** **See Also**

View File

@ -433,7 +433,7 @@ Possible values:
- 0 `INSERT` query appends new data to the end of the file. - 0 `INSERT` query appends new data to the end of the file.
- 1 `INSERT` query creates a new file. - 1 `INSERT` query creates a new file.
)", 0) \ )", 0) \
DECLARE(Bool, s3_skip_empty_files, false, R"( DECLARE(Bool, s3_skip_empty_files, true, R"(
Enables or disables skipping empty files in [S3](../../engines/table-engines/integrations/s3.md) engine tables. Enables or disables skipping empty files in [S3](../../engines/table-engines/integrations/s3.md) engine tables.
Possible values: Possible values:

View File

@ -87,6 +87,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"filesystem_cache_skip_download_if_exceeds_per_query_cache_write_limit", 1, 1, "Rename of setting skip_download_if_exceeds_query_cache_limit"}, {"filesystem_cache_skip_download_if_exceeds_per_query_cache_write_limit", 1, 1, "Rename of setting skip_download_if_exceeds_query_cache_limit"},
{"filesystem_cache_prefer_bigger_buffer_size", true, true, "New setting"}, {"filesystem_cache_prefer_bigger_buffer_size", true, true, "New setting"},
{"read_in_order_use_virtual_row", false, false, "Use virtual row while reading in order of primary key or its monotonic function fashion. It is useful when searching over multiple parts as only relevant ones are touched."}, {"read_in_order_use_virtual_row", false, false, "Use virtual row while reading in order of primary key or its monotonic function fashion. It is useful when searching over multiple parts as only relevant ones are touched."},
{"s3_skip_empty_files", false, true, "We hope it will provide better UX"},
{"filesystem_cache_boundary_alignment", 0, 0, "New setting"}, {"filesystem_cache_boundary_alignment", 0, 0, "New setting"},
{"push_external_roles_in_interserver_queries", false, false, "New setting."}, {"push_external_roles_in_interserver_queries", false, false, "New setting."},
} }

View File

@ -218,7 +218,6 @@ ReadBufferIterator::Data ReadBufferIterator::next()
} }
const auto filename = current_object_info->getFileName(); const auto filename = current_object_info->getFileName();
chassert(!filename.empty());
/// file iterator could get new keys after new iteration /// file iterator could get new keys after new iteration
if (read_keys.size() > prev_read_keys_size) if (read_keys.size() > prev_read_keys_size)

View File

@ -306,7 +306,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
{ {
object_info = file_iterator->next(processor); object_info = file_iterator->next(processor);
if (!object_info || object_info->getFileName().empty()) if (!object_info || object_info->getPath().empty())
return {}; return {};
if (!object_info->metadata) if (!object_info->metadata)

View File

@ -0,0 +1,16 @@
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3

View File

@ -0,0 +1,28 @@
-- Tags: no-parallel, no-fasttest
-- Tag no-fasttest: Depends on AWS
-- Checks that a `*` glob over an S3 prefix returns every object when one of the
-- keys ends with '/' (i.e. the part after the last slash is empty), for both the
-- `s3` and `s3Cluster` table functions.
-- Truncate existing objects on insert; presumably so that reruns of this test
-- do not fail on objects left over from a previous run.
SET s3_truncate_on_insert = 1;
-- Explicitly turn off skipping of empty files (this change flips the default to
-- enabled); presumably so the empty objects queried at the end are not skipped.
SET s3_skip_empty_files = 0;
-- Write four single-row Parquet objects under one prefix. The first key is the
-- bare prefix itself, ending in '/'; the other three are ordinary file names.
INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/', format=Parquet) SELECT 0 as num;
INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/file1', format=Parquet) SELECT 1 as num;
INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/file2', format=Parquet) SELECT 2 as num;
INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/file3', format=Parquet) SELECT 3 as num;
-- Read everything back through the glob with one and with four threads, first
-- via `s3` and then via `s3Cluster`. The reference output lists 0..3 four times,
-- so each of these SELECTs is expected to return all four rows.
SELECT * FROM s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 1;
SELECT * FROM s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 4;
SELECT * FROM s3Cluster('test_cluster_two_shards_localhost', s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 1;
SELECT * FROM s3Cluster('test_cluster_two_shards_localhost', s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 4;
-- Empty "directory" files created implicitly by S3 console:
-- https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html
-- The two queries below glob a public bucket prefix and send the result to the
-- Null format, so they contribute nothing to the reference output; they only
-- verify that the query completes without an exception.
SELECT *
FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/wikistat/original/*', NOSIGN)
LIMIT 1
FORMAT Null;
SELECT *
FROM s3Cluster('test_cluster_two_shards_localhost', 'https://clickhouse-public-datasets.s3.amazonaws.com/wikistat/original/*', NOSIGN)
LIMIT 1
Format Null;