From 70a2061c9bccd85f6c939529609051c06042c563 Mon Sep 17 00:00:00 2001 From: divanik Date: Tue, 2 Jul 2024 12:27:01 +0000 Subject: [PATCH 1/2] Fixed bug and added test --- src/Disks/ObjectStorages/IObjectStorage.h | 1 + .../StorageObjectStorageSource.cpp | 41 ++++--- .../StorageObjectStorageSource.h | 6 +- .../03036_reading_s3_archives.reference | 104 +++++++++--------- .../0_stateless/03036_reading_s3_archives.sql | 30 ++--- 5 files changed, 96 insertions(+), 86 deletions(-) diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 9f5c14fdb7c..6410a9a7a73 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -75,6 +75,7 @@ struct RelativePathWithMetadata virtual std::string getPath() const { return relative_path; } virtual bool isArchive() const { return false; } virtual std::string getPathToArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); } + virtual size_t fileSizeInArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); } }; struct ObjectKeyWithMetadata diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index aef783fc3c4..9436e729683 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -196,13 +196,12 @@ Chunk StorageObjectStorageSource::generate() const auto & filename = object_info->getFileName(); chassert(object_info->metadata); VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( - chunk, read_from_format_info.requested_virtual_columns, - { - .path = getUniqueStoragePathIdentifier(*configuration, *object_info, false), - .size = object_info->metadata->size_bytes, - .filename = &filename, - .last_modified = object_info->metadata->last_modified - }); + chunk, + read_from_format_info.requested_virtual_columns, + {.path = getUniqueStoragePathIdentifier(*configuration, *object_info, false), + .size = object_info->isArchive() ? object_info->fileSizeInArchive() : object_info->metadata->size_bytes, + .filename = &filename, + .last_modified = object_info->metadata->last_modified}); return chunk; } @@ -690,10 +689,9 @@ static IArchiveReader::NameFilter createArchivePathFilter(const std::string & ar StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive::ObjectInfoInArchive( ObjectInfoPtr archive_object_, const std::string & path_in_archive_, - std::shared_ptr archive_reader_) - : archive_object(archive_object_) - , path_in_archive(path_in_archive_) - , archive_reader(archive_reader_) + std::shared_ptr archive_reader_, + IArchiveReader::FileInfo && file_info_) + : archive_object(archive_object_), path_in_archive(path_in_archive_), archive_reader(archive_reader_), file_info(file_info_) { } @@ -732,6 +730,7 @@ StorageObjectStorageSource::ObjectInfoPtr StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor) { std::unique_lock lock{next_mutex}; + IArchiveReader::FileInfo current_file_info{}; while (true) { if (filter) @@ -756,6 +755,8 @@ StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor) path_in_archive = file_enumerator->getFileName(); if (!filter(path_in_archive)) continue; + else + current_file_info = file_enumerator->getFileInfo(); } else { @@ -769,15 +770,19 @@ StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor) archive_reader = createArchiveReader(archive_object); if (!archive_reader->fileExists(path_in_archive)) continue; + else + current_file_info = archive_reader->getFileInfo(path_in_archive); } - - auto object_in_archive = std::make_shared(archive_object, path_in_archive, archive_reader); - - if (read_keys != nullptr) - read_keys->push_back(object_in_archive); - - return object_in_archive; + break; } + + auto object_in_archive + = std::make_shared(archive_object, path_in_archive, archive_reader, std::move(current_file_info)); + + if (read_keys != nullptr) + read_keys->push_back(object_in_archive); + + return object_in_archive; } size_t StorageObjectStorageSource::ArchiveIterator::estimatedKeysCount() diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index d93097d2636..2cbe8a9776c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -259,7 +259,8 @@ public: ObjectInfoInArchive( ObjectInfoPtr archive_object_, const std::string & path_in_archive_, - std::shared_ptr archive_reader_); + std::shared_ptr archive_reader_, + IArchiveReader::FileInfo && file_info_); std::string getFileName() const override { @@ -278,9 +279,12 @@ public: bool isArchive() const override { return true; } + size_t fileSizeInArchive() const override { return file_info.uncompressed_size; } + const ObjectInfoPtr archive_object; const std::string path_in_archive; const std::shared_ptr archive_reader; + const IArchiveReader::FileInfo file_info; }; private: diff --git a/tests/queries/0_stateless/03036_reading_s3_archives.reference b/tests/queries/0_stateless/03036_reading_s3_archives.reference index 36ced212a1b..eacf16d0295 100644 --- a/tests/queries/0_stateless/03036_reading_s3_archives.reference +++ b/tests/queries/0_stateless/03036_reading_s3_archives.reference @@ -1,52 +1,52 @@ -1 Str1 example1.csv test/03036_archive1.zip::example1.csv -2 Str2 example1.csv test/03036_archive1.zip::example1.csv -3 Str3 example2.csv test/03036_archive2.zip::example2.csv -4 Str4 example2.csv test/03036_archive2.zip::example2.csv -5 Str5 example3.csv test/03036_archive2.zip::example3.csv -6 Str6 example3.csv test/03036_archive2.zip::example3.csv -3 Str3 example2.csv test/03036_archive1.zip::example2.csv -3 Str3 example2.csv test/03036_archive2.zip::example2.csv -4 Str4 example2.csv test/03036_archive1.zip::example2.csv -4 Str4 example2.csv test/03036_archive2.zip::example2.csv -1 Str1 example1.csv test/03036_archive1.zip::example1.csv -2 Str2 example1.csv test/03036_archive1.zip::example1.csv -3 Str3 example2.csv test/03036_archive1.zip::example2.csv -3 Str3 example2.csv test/03036_archive2.zip::example2.csv -4 Str4 example2.csv test/03036_archive1.zip::example2.csv -4 Str4 example2.csv test/03036_archive2.zip::example2.csv -5 Str5 example3.csv test/03036_archive2.zip::example3.csv -6 Str6 example3.csv test/03036_archive2.zip::example3.csv -1 Str1 example1.csv test/03036_archive1.tar::example1.csv -2 Str2 example1.csv test/03036_archive1.tar::example1.csv -7 Str7 example4.csv test/03036_archive1.tar::example4.csv -7 Str7 example4.csv test/03036_archive2.tar::example4.csv -8 Str8 example4.csv test/03036_archive1.tar::example4.csv -8 Str8 example4.csv test/03036_archive2.tar::example4.csv -5 Str5 example3.csv test/03036_archive2.tar::example3.csv -6 Str6 example3.csv test/03036_archive2.tar::example3.csv -7 Str7 example4.csv test/03036_archive2.tar::example4.csv -8 Str8 example4.csv test/03036_archive2.tar::example4.csv -9 Str9 example5.csv test/03036_archive2.tar::example5.csv -10 Str10 example5.csv test/03036_archive2.tar::example5.csv -3 Str3 example2.csv test/03036_archive3.tar.gz::example2.csv -4 Str4 example2.csv test/03036_archive3.tar.gz::example2.csv -11 Str11 example6.csv test/03036_archive3.tar.gz::example6.csv -12 Str12 example6.csv test/03036_archive3.tar.gz::example6.csv -3 Str3 example2.csv test/03036_archive3.tar.gz::example2.csv -4 Str4 example2.csv test/03036_archive3.tar.gz::example2.csv -5 Str5 example3.csv test/03036_archive2.tar::example3.csv -6 Str6 example3.csv test/03036_archive2.tar::example3.csv -3 Str3 example2.csv test/03036_archive2.zip::example2.csv -4 Str4 example2.csv test/03036_archive2.zip::example2.csv -5 Str5 example3.csv test/03036_archive2.tar::example3.csv -6 Str6 example3.csv test/03036_archive2.tar::example3.csv -7 Str7 example4.csv test/03036_archive2.tar::example4.csv -8 Str8 example4.csv test/03036_archive2.tar::example4.csv -9 Str9 example5.csv test/03036_archive2.tar::example5.csv -10 Str10 example5.csv test/03036_archive2.tar::example5.csv -3 Str3 example2.csv test/03036_archive3.tar.gz::example2.csv -4 Str4 example2.csv test/03036_archive3.tar.gz::example2.csv -5 Str5 example3.csv test/03036_archive2.tar::example3.csv -6 Str6 example3.csv test/03036_archive2.tar::example3.csv -13 Str13 example7.csv test/03036_compressed_file_archive.zip::example7.csv -14 Str14 example7.csv test/03036_compressed_file_archive.zip::example7.csv +1 Str1 25 example1.csv test/03036_archive1.zip::example1.csv +2 Str2 25 example1.csv test/03036_archive1.zip::example1.csv +3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv +4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv +5 Str5 25 example3.csv test/03036_archive2.zip::example3.csv +6 Str6 25 example3.csv test/03036_archive2.zip::example3.csv +3 Str3 25 example2.csv test/03036_archive1.zip::example2.csv +3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv +4 Str4 25 example2.csv test/03036_archive1.zip::example2.csv +4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv +1 Str1 25 example1.csv test/03036_archive1.zip::example1.csv +2 Str2 25 example1.csv test/03036_archive1.zip::example1.csv +3 Str3 25 example2.csv test/03036_archive1.zip::example2.csv +3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv +4 Str4 25 example2.csv test/03036_archive1.zip::example2.csv +4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv +5 Str5 25 example3.csv test/03036_archive2.zip::example3.csv +6 Str6 25 example3.csv test/03036_archive2.zip::example3.csv +1 Str1 25 example1.csv test/03036_archive1.tar::example1.csv +2 Str2 25 example1.csv test/03036_archive1.tar::example1.csv +7 Str7 25 example4.csv test/03036_archive1.tar::example4.csv +7 Str7 25 example4.csv test/03036_archive2.tar::example4.csv +8 Str8 25 example4.csv test/03036_archive1.tar::example4.csv +8 Str8 25 example4.csv test/03036_archive2.tar::example4.csv +5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv +6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv +7 Str7 25 example4.csv test/03036_archive2.tar::example4.csv +8 Str8 25 example4.csv test/03036_archive2.tar::example4.csv +9 Str9 27 example5.csv test/03036_archive2.tar::example5.csv +10 Str10 27 example5.csv test/03036_archive2.tar::example5.csv +3 Str3 25 example2.csv test/03036_archive3.tar.gz::example2.csv +4 Str4 25 example2.csv test/03036_archive3.tar.gz::example2.csv +11 Str11 29 example6.csv test/03036_archive3.tar.gz::example6.csv +12 Str12 29 example6.csv test/03036_archive3.tar.gz::example6.csv +3 Str3 25 example2.csv test/03036_archive3.tar.gz::example2.csv +4 Str4 25 example2.csv test/03036_archive3.tar.gz::example2.csv +5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv +6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv +3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv +4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv +5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv +6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv +7 Str7 25 example4.csv test/03036_archive2.tar::example4.csv +8 Str8 25 example4.csv test/03036_archive2.tar::example4.csv +9 Str9 27 example5.csv test/03036_archive2.tar::example5.csv +10 Str10 27 example5.csv test/03036_archive2.tar::example5.csv +3 Str3 25 example2.csv test/03036_archive3.tar.gz::example2.csv +4 Str4 25 example2.csv test/03036_archive3.tar.gz::example2.csv +5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv +6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv +13 Str13 57 example7.csv test/03036_compressed_file_archive.zip::example7.csv +14 Str14 57 example7.csv test/03036_compressed_file_archive.zip::example7.csv diff --git a/tests/queries/0_stateless/03036_reading_s3_archives.sql b/tests/queries/0_stateless/03036_reading_s3_archives.sql index 00d7cc25e1a..43bda4ee704 100644 --- a/tests/queries/0_stateless/03036_reading_s3_archives.sql +++ b/tests/queries/0_stateless/03036_reading_s3_archives.sql @@ -1,22 +1,22 @@ -- Tags: no-fasttest -- Tag no-fasttest: Depends on AWS -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive1.zip :: example1.csv') ORDER BY (id, _file, _path); -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive2.zip :: example*.csv') ORDER BY (id, _file, _path); -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example2.csv') ORDER BY (id, _file, _path); -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example*') ORDER BY (id, _file, _path); -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive1.tar :: example1.csv') ORDER BY (id, _file, _path); -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar :: example4.csv') ORDER BY (id, _file, _path); -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive2.tar :: example*.csv') ORDER BY (id, _file, _path); -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar.gz :: example*.csv') ORDER BY (id, _file, _path); -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv') ORDER BY (id, _file, _path); -select id, data, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } -select id, data, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent{2..3}.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive1.zip :: example1.csv') ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive2.zip :: example*.csv') ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example2.csv') ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example*') ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive1.tar :: example1.csv') ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar :: example4.csv') ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive2.tar :: example*.csv') ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar.gz :: example*.csv') ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv') ORDER BY (id, _file, _path); +select id, data, _size, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } +select id, data, _size, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent{2..3}.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } CREATE TABLE table_zip22 Engine S3(s3_conn, filename='03036_archive2.zip :: example2.csv'); -select id, data, _file, _path from table_zip22 ORDER BY (id, _file, _path); +select id, data, _size, _file, _path from table_zip22 ORDER BY (id, _file, _path); CREATE table table_tar2star Engine S3(s3_conn, filename='03036_archive2.tar :: example*.csv'); -SELECT id, data, _file, _path FROM table_tar2star ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM table_tar2star ORDER BY (id, _file, _path); CREATE table table_tarstarglobs Engine S3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv'); -SELECT id, data, _file, _path FROM table_tarstarglobs ORDER BY (id, _file, _path); +SELECT id, data, _size, _file, _path FROM table_tarstarglobs ORDER BY (id, _file, _path); CREATE table table_noexist Engine s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError UNKNOWN_STORAGE } -SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_compressed_file_archive.zip :: example7.csv', format='CSV', structure='auto', compression_method='gz') ORDER BY (id, _file, _path) +SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_compressed_file_archive.zip :: example7.csv', format='CSV', structure='auto', compression_method='gz') ORDER BY (id, _file, _path) From f2244853164906cbf6faf186b7f3782ccc504d5a Mon Sep 17 00:00:00 2001 From: divanik Date: Tue, 2 Jul 2024 13:01:33 +0000 Subject: [PATCH 2/2] Add reference to documentation --- docs/en/sql-reference/table-functions/s3.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 1a7e2b8d66a..35e5d86034c 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -269,9 +269,9 @@ FROM s3( ## Virtual Columns {#virtual-columns} -- `_path` — Path to the file. Type: `LowCardinalty(String)`. -- `_file` — Name of the file. Type: `LowCardinalty(String)`. -- `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. +- `_path` — Path to the file. Type: `LowCardinalty(String)`. In case of archive, shows path in a format: "{path_to_archive}::{path_to_file_inside_archive}" +- `_file` — Name of the file. Type: `LowCardinalty(String)`. In case of archive shows name of the file inside the archive. +- `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. In case of archive shows uncompressed file size of the file inside the archive. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. ## Storage Settings {#storage-settings}