From 1c682f13163522c2a38634f7566e55d7bebc7b4a Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 14 Nov 2024 22:38:05 +0100 Subject: [PATCH 1/6] impl --- .../ObjectStorage/StorageObjectStorageSource.cpp | 2 +- .../03271_s3_table_function_asterisk_glob.reference | 4 ++++ .../03271_s3_table_function_asterisk_glob.sql | 12 ++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference create mode 100644 tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 1ccf23ade90..105bea87dc4 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -305,7 +305,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade { object_info = file_iterator->next(processor); - if (!object_info || object_info->getFileName().empty()) + if (!object_info) return {}; if (!object_info->metadata) diff --git a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference new file mode 100644 index 00000000000..bc856dafab0 --- /dev/null +++ b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference @@ -0,0 +1,4 @@ +0 +1 +2 +3 diff --git a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql new file mode 100644 index 00000000000..0dabd3de7ba --- /dev/null +++ b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql @@ -0,0 +1,12 @@ +-- Tags: no-parallel, no-fasttest +-- Tag no-fasttest: Depends on AWS + +SET max_threads = 1; +SET s3_truncate_on_insert = 1; + +INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/', format=Parquet) SELECT 0 as num; +INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/file1', format=Parquet) SELECT 1 as num; +INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/file2', format=Parquet) SELECT 2 as num; +INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/file3', format=Parquet) SELECT 3 as num; + +SELECT * FROM s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL; From a82ab36c08b02918779afed9dd27ac237cb3416e Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 15 Nov 2024 12:12:43 +0100 Subject: [PATCH 2/6] try fix --- src/Storages/ObjectStorage/ReadBufferIterator.cpp | 1 - src/Storages/ObjectStorage/StorageObjectStorageSource.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 7dd3ad0d79f..7c184f52872 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -218,7 +218,6 @@ ReadBufferIterator::Data ReadBufferIterator::next() } const auto filename = current_object_info->getFileName(); - chassert(!filename.empty()); /// file iterator could get new keys after new iteration if (read_keys.size() > prev_read_keys_size) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 105bea87dc4..fd99b832973 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -305,7 +305,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade { object_info = file_iterator->next(processor); - if (!object_info) + if (!object_info || object_info->getPath().empty()) return {}; if (!object_info->metadata) From e45dd36343915d3504067126b913e60faae7be0b Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 18 Nov 2024 13:38:26 +0100 Subject: [PATCH 3/6] add test from Christoph --- .../03271_s3_table_function_asterisk_glob.reference | 4 ++++ .../03271_s3_table_function_asterisk_glob.sql | 13 +++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference index bc856dafab0..d4dc73eff36 100644 --- a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference +++ b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference @@ -2,3 +2,7 @@ 1 2 3 +0 +1 +2 +3 diff --git a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql index 0dabd3de7ba..b0937a073c1 100644 --- a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql +++ b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql @@ -1,12 +1,21 @@ -- Tags: no-parallel, no-fasttest -- Tag no-fasttest: Depends on AWS -SET max_threads = 1; SET s3_truncate_on_insert = 1; +SET s3_skip_empty_files = 0; INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/', format=Parquet) SELECT 0 as num; INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/file1', format=Parquet) SELECT 1 as num; INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/file2', format=Parquet) SELECT 2 as num; INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/file3', format=Parquet) SELECT 3 as num; -SELECT * FROM s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL; +SELECT * FROM s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 1; +SELECT * FROM s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 4; + +-- Empty "directory" files created implicitly by S3 console: +-- https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html +SELECT * +FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/*', NOSIGN) +LIMIT 1 +FORMAT Null +SETTINGS s3_skip_empty_files = 1; From 3a012e5a967af5949d9ce228a8b765146aa7021e Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 18 Nov 2024 13:55:18 +0100 Subject: [PATCH 4/6] enable s3_skip_empty_files by default --- docs/en/engines/table-engines/integrations/s3.md | 2 +- docs/en/sql-reference/table-functions/s3.md | 2 +- src/Core/Settings.cpp | 2 +- src/Core/SettingsChangesHistory.cpp | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index fd27d4b6ed9..9868b2a05a8 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -258,7 +258,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) - [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default. - [s3_create_new_file_on_insert](/docs/en/operations/settings/settings.md#s3_create_new_file_on_insert) - allows to create a new file on each insert if format has suffix. Disabled by default. -- [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Disabled by default. +- [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Enabled by default. ## S3-related Settings {#settings} diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index b14eb84392f..ea7820c1aec 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -317,7 +317,7 @@ SELECT * from s3('s3://data/path/date=*/country=*/code=*/*.parquet') where _date - [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default. - [s3_create_new_file_on_insert](/docs/en/operations/settings/settings.md#s3_create_new_file_on_insert) - allows to create a new file on each insert if format has suffix. Disabled by default. -- [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Disabled by default. +- [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Enabled by default. **See Also** diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 0e2e1f8a3f0..2fee9b4a718 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -430,7 +430,7 @@ Possible values: - 0 — `INSERT` query appends new data to the end of the file. - 1 — `INSERT` query creates a new file. )", 0) \ - DECLARE(Bool, s3_skip_empty_files, false, R"( + DECLARE(Bool, s3_skip_empty_files, true, R"( Enables or disables skipping empty files in [S3](../../engines/table-engines/integrations/s3.md) engine tables. Possible values: diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index dc95054f722..b4e0a29ec25 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -84,6 +84,7 @@ static std::initializer_list Date: Mon, 18 Nov 2024 14:04:02 +0100 Subject: [PATCH 5/6] upd test --- .../03271_s3_table_function_asterisk_glob.reference | 8 ++++++++ .../03271_s3_table_function_asterisk_glob.sql | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference index d4dc73eff36..373831be4eb 100644 --- a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference +++ b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.reference @@ -6,3 +6,11 @@ 1 2 3 +0 +1 +2 +3 +0 +1 +2 +3 diff --git a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql index b0937a073c1..37d974c5503 100644 --- a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql +++ b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql @@ -12,6 +12,9 @@ INSERT INTO FUNCTION s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk SELECT * FROM s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 1; SELECT * FROM s3(s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 4; +SELECT * FROM s3Cluster('test_cluster_two_shards_localhost', s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 1; +SELECT * FROM s3Cluster('test_cluster_two_shards_localhost', s3_conn, filename='dir1/03271_s3_table_function_asterisk_glob/*') ORDER BY ALL SETTINGS max_threads = 4; + -- Empty "directory" files created implicitly by S3 console: -- https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html SELECT * @@ -19,3 +22,10 @@ FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_review LIMIT 1 FORMAT Null SETTINGS s3_skip_empty_files = 1; + +-- TODO: Still doesn't work +-- SELECT * +-- FROM s3Cluster('test_cluster_two_shards_localhost', 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/*', NOSIGN) +-- LIMIT 1 +-- FORMAT Null +-- SETTINGS s3_skip_empty_files = 1; From 1c5d0c7f933b38daf608e4c3d7f63874aa559840 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 19 Nov 2024 19:43:56 +0100 Subject: [PATCH 6/6] fix test --- .../03271_s3_table_function_asterisk_glob.sql | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql index 37d974c5503..d3dba883f23 100644 --- a/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql +++ b/tests/queries/0_stateless/03271_s3_table_function_asterisk_glob.sql @@ -18,14 +18,11 @@ SELECT * FROM s3Cluster('test_cluster_two_shards_localhost', s3_conn, filename=' -- Empty "directory" files created implicitly by S3 console: -- https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html SELECT * -FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/*', NOSIGN) +FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/wikistat/original/*', NOSIGN) LIMIT 1 -FORMAT Null -SETTINGS s3_skip_empty_files = 1; +FORMAT Null; --- TODO: Still doesn't work --- SELECT * --- FROM s3Cluster('test_cluster_two_shards_localhost', 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/*', NOSIGN) --- LIMIT 1 --- FORMAT Null --- SETTINGS s3_skip_empty_files = 1; +SELECT * +FROM s3Cluster('test_cluster_two_shards_localhost', 'https://clickhouse-public-datasets.s3.amazonaws.com/wikistat/original/*', NOSIGN) +LIMIT 1 +Format Null;