Merge pull request #65993 from ClickHouse/divanik/fix_size_column_in_s3_archives

Fix bug with _size of archives in s3 table function
2024-11-24 16:42:05 +00:00 · 2024-07-03 13:38:01 +00:00 · 2024-07-03 13:38:01 +00:00 · 4ab2d514e6
commit 4ab2d514e6
parent d3ede543bb 97215f473a
6 changed files with 99 additions and 90 deletions
--- a/docs/en/sql-reference/table-functions/s3.md
+++ b/docs/en/sql-reference/table-functions/s3.md
@ -269,9 +269,9 @@ FROM s3(

 ## Virtual Columns {#virtual-columns}

- `_path` — Path to the file. Type: `LowCardinalty(String)`.
- `_file` — Name of the file. Type: `LowCardinalty(String)`.
- `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`.
+- `_path` — Path to the file. Type: `LowCardinality(String)`. In case of an archive, shows the path in the format: "{path_to_archive}::{path_to_file_inside_archive}".
+- `_file` — Name of the file. Type: `LowCardinality(String)`. In case of an archive, shows the name of the file inside the archive.
+- `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. In case of an archive, shows the uncompressed size of the file inside the archive.
 - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`.

 ## Storage Settings {#storage-settings}
--- a/src/Disks/ObjectStorages/IObjectStorage.h
+++ b/src/Disks/ObjectStorages/IObjectStorage.h
@ -75,6 +75,7 @@ struct RelativePathWithMetadata
    virtual std::string getPath() const { return relative_path; }
    virtual bool isArchive() const { return false; }
    virtual std::string getPathToArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); }
+    virtual size_t fileSizeInArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); }
 };

 struct ObjectKeyWithMetadata
--- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
@ -196,13 +196,12 @@ Chunk StorageObjectStorageSource::generate()
            const auto & filename = object_info->getFileName();
            chassert(object_info->metadata);
            VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk(
-                chunk, read_from_format_info.requested_virtual_columns,
-                {
-                    .path = getUniqueStoragePathIdentifier(*configuration, *object_info, false),
-                    .size = object_info->metadata->size_bytes,
-                    .filename = &filename,
-                    .last_modified = object_info->metadata->last_modified
-                });
+                chunk,
+                read_from_format_info.requested_virtual_columns,
+                {.path = getUniqueStoragePathIdentifier(*configuration, *object_info, false),
+                 .size = object_info->isArchive() ? object_info->fileSizeInArchive() : object_info->metadata->size_bytes,
+                 .filename = &filename,
+                 .last_modified = object_info->metadata->last_modified});

            const auto & partition_columns = configuration->getPartitionColumns();
            if (!partition_columns.empty() && chunk_size && chunk.hasColumns())
@ -227,7 +226,6 @@ Chunk StorageObjectStorageSource::generate()
                        chunk.addColumn(std::move(partition_column));
                }
            }
-
            return chunk;
        }

@ -715,10 +713,9 @@ static IArchiveReader::NameFilter createArchivePathFilter(const std::string & ar
 StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive::ObjectInfoInArchive(
    ObjectInfoPtr archive_object_,
    const std::string & path_in_archive_,
-    std::shared_ptr<IArchiveReader> archive_reader_)
-    : archive_object(archive_object_)
-    , path_in_archive(path_in_archive_)
-    , archive_reader(archive_reader_)
+    std::shared_ptr<IArchiveReader> archive_reader_,
+    IArchiveReader::FileInfo && file_info_)
+    : archive_object(archive_object_), path_in_archive(path_in_archive_), archive_reader(archive_reader_), file_info(file_info_)
 {
 }

@ -757,6 +754,7 @@ StorageObjectStorageSource::ObjectInfoPtr
 StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor)
 {
    std::unique_lock lock{next_mutex};
+    IArchiveReader::FileInfo current_file_info{};
    while (true)
    {
        if (filter)
@ -781,6 +779,8 @@ StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor)
            path_in_archive = file_enumerator->getFileName();
            if (!filter(path_in_archive))
                continue;
+            else
+                current_file_info = file_enumerator->getFileInfo();
        }
        else
        {
@ -794,15 +794,19 @@ StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor)
            archive_reader = createArchiveReader(archive_object);
            if (!archive_reader->fileExists(path_in_archive))
                continue;
+            else
+                current_file_info = archive_reader->getFileInfo(path_in_archive);
        }
-
-        auto object_in_archive = std::make_shared<ObjectInfoInArchive>(archive_object, path_in_archive, archive_reader);
-
-        if (read_keys != nullptr)
-            read_keys->push_back(object_in_archive);
-
-        return object_in_archive;
+        break;
    }
+
+    auto object_in_archive
+        = std::make_shared<ObjectInfoInArchive>(archive_object, path_in_archive, archive_reader, std::move(current_file_info));
+
+    if (read_keys != nullptr)
+        read_keys->push_back(object_in_archive);
+
+    return object_in_archive;
 }

 size_t StorageObjectStorageSource::ArchiveIterator::estimatedKeysCount()
--- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h
@ -260,7 +260,8 @@ public:
        ObjectInfoInArchive(
            ObjectInfoPtr archive_object_,
            const std::string & path_in_archive_,
-            std::shared_ptr<IArchiveReader> archive_reader_);
+            std::shared_ptr<IArchiveReader> archive_reader_,
+            IArchiveReader::FileInfo && file_info_);

        std::string getFileName() const override
        {
@ -279,9 +280,12 @@ public:

        bool isArchive() const override { return true; }

+        size_t fileSizeInArchive() const override { return file_info.uncompressed_size; }
+
        const ObjectInfoPtr archive_object;
        const std::string path_in_archive;
        const std::shared_ptr<IArchiveReader> archive_reader;
+        const IArchiveReader::FileInfo file_info;
    };

 private:
--- a/tests/queries/0_stateless/03036_reading_s3_archives.reference
+++ b/tests/queries/0_stateless/03036_reading_s3_archives.reference
@ -1,52 +1,52 @@
-1	Str1	example1.csv	test/03036_archive1.zip::example1.csv
-2	Str2	example1.csv	test/03036_archive1.zip::example1.csv
-3	Str3	example2.csv	test/03036_archive2.zip::example2.csv
-4	Str4	example2.csv	test/03036_archive2.zip::example2.csv
-5	Str5	example3.csv	test/03036_archive2.zip::example3.csv
-6	Str6	example3.csv	test/03036_archive2.zip::example3.csv
-3	Str3	example2.csv	test/03036_archive1.zip::example2.csv
-3	Str3	example2.csv	test/03036_archive2.zip::example2.csv
-4	Str4	example2.csv	test/03036_archive1.zip::example2.csv
-4	Str4	example2.csv	test/03036_archive2.zip::example2.csv
-1	Str1	example1.csv	test/03036_archive1.zip::example1.csv
-2	Str2	example1.csv	test/03036_archive1.zip::example1.csv
-3	Str3	example2.csv	test/03036_archive1.zip::example2.csv
-3	Str3	example2.csv	test/03036_archive2.zip::example2.csv
-4	Str4	example2.csv	test/03036_archive1.zip::example2.csv
-4	Str4	example2.csv	test/03036_archive2.zip::example2.csv
-5	Str5	example3.csv	test/03036_archive2.zip::example3.csv
-6	Str6	example3.csv	test/03036_archive2.zip::example3.csv
-1	Str1	example1.csv	test/03036_archive1.tar::example1.csv
-2	Str2	example1.csv	test/03036_archive1.tar::example1.csv
-7	Str7	example4.csv	test/03036_archive1.tar::example4.csv
-7	Str7	example4.csv	test/03036_archive2.tar::example4.csv
-8	Str8	example4.csv	test/03036_archive1.tar::example4.csv
-8	Str8	example4.csv	test/03036_archive2.tar::example4.csv
-5	Str5	example3.csv	test/03036_archive2.tar::example3.csv
-6	Str6	example3.csv	test/03036_archive2.tar::example3.csv
-7	Str7	example4.csv	test/03036_archive2.tar::example4.csv
-8	Str8	example4.csv	test/03036_archive2.tar::example4.csv
-9	Str9	example5.csv	test/03036_archive2.tar::example5.csv
-10	Str10	example5.csv	test/03036_archive2.tar::example5.csv
-3	Str3	example2.csv	test/03036_archive3.tar.gz::example2.csv
-4	Str4	example2.csv	test/03036_archive3.tar.gz::example2.csv
-11	Str11	example6.csv	test/03036_archive3.tar.gz::example6.csv
-12	Str12	example6.csv	test/03036_archive3.tar.gz::example6.csv
-3	Str3	example2.csv	test/03036_archive3.tar.gz::example2.csv
-4	Str4	example2.csv	test/03036_archive3.tar.gz::example2.csv
-5	Str5	example3.csv	test/03036_archive2.tar::example3.csv
-6	Str6	example3.csv	test/03036_archive2.tar::example3.csv
-3	Str3	example2.csv	test/03036_archive2.zip::example2.csv
-4	Str4	example2.csv	test/03036_archive2.zip::example2.csv
-5	Str5	example3.csv	test/03036_archive2.tar::example3.csv
-6	Str6	example3.csv	test/03036_archive2.tar::example3.csv
-7	Str7	example4.csv	test/03036_archive2.tar::example4.csv
-8	Str8	example4.csv	test/03036_archive2.tar::example4.csv
-9	Str9	example5.csv	test/03036_archive2.tar::example5.csv
-10	Str10	example5.csv	test/03036_archive2.tar::example5.csv
-3	Str3	example2.csv	test/03036_archive3.tar.gz::example2.csv
-4	Str4	example2.csv	test/03036_archive3.tar.gz::example2.csv
-5	Str5	example3.csv	test/03036_archive2.tar::example3.csv
-6	Str6	example3.csv	test/03036_archive2.tar::example3.csv
-13	Str13	example7.csv	test/03036_compressed_file_archive.zip::example7.csv
-14	Str14	example7.csv	test/03036_compressed_file_archive.zip::example7.csv
+1	Str1	25	example1.csv	test/03036_archive1.zip::example1.csv
+2	Str2	25	example1.csv	test/03036_archive1.zip::example1.csv
+3	Str3	25	example2.csv	test/03036_archive2.zip::example2.csv
+4	Str4	25	example2.csv	test/03036_archive2.zip::example2.csv
+5	Str5	25	example3.csv	test/03036_archive2.zip::example3.csv
+6	Str6	25	example3.csv	test/03036_archive2.zip::example3.csv
+3	Str3	25	example2.csv	test/03036_archive1.zip::example2.csv
+3	Str3	25	example2.csv	test/03036_archive2.zip::example2.csv
+4	Str4	25	example2.csv	test/03036_archive1.zip::example2.csv
+4	Str4	25	example2.csv	test/03036_archive2.zip::example2.csv
+1	Str1	25	example1.csv	test/03036_archive1.zip::example1.csv
+2	Str2	25	example1.csv	test/03036_archive1.zip::example1.csv
+3	Str3	25	example2.csv	test/03036_archive1.zip::example2.csv
+3	Str3	25	example2.csv	test/03036_archive2.zip::example2.csv
+4	Str4	25	example2.csv	test/03036_archive1.zip::example2.csv
+4	Str4	25	example2.csv	test/03036_archive2.zip::example2.csv
+5	Str5	25	example3.csv	test/03036_archive2.zip::example3.csv
+6	Str6	25	example3.csv	test/03036_archive2.zip::example3.csv
+1	Str1	25	example1.csv	test/03036_archive1.tar::example1.csv
+2	Str2	25	example1.csv	test/03036_archive1.tar::example1.csv
+7	Str7	25	example4.csv	test/03036_archive1.tar::example4.csv
+7	Str7	25	example4.csv	test/03036_archive2.tar::example4.csv
+8	Str8	25	example4.csv	test/03036_archive1.tar::example4.csv
+8	Str8	25	example4.csv	test/03036_archive2.tar::example4.csv
+5	Str5	25	example3.csv	test/03036_archive2.tar::example3.csv
+6	Str6	25	example3.csv	test/03036_archive2.tar::example3.csv
+7	Str7	25	example4.csv	test/03036_archive2.tar::example4.csv
+8	Str8	25	example4.csv	test/03036_archive2.tar::example4.csv
+9	Str9	27	example5.csv	test/03036_archive2.tar::example5.csv
+10	Str10	27	example5.csv	test/03036_archive2.tar::example5.csv
+3	Str3	25	example2.csv	test/03036_archive3.tar.gz::example2.csv
+4	Str4	25	example2.csv	test/03036_archive3.tar.gz::example2.csv
+11	Str11	29	example6.csv	test/03036_archive3.tar.gz::example6.csv
+12	Str12	29	example6.csv	test/03036_archive3.tar.gz::example6.csv
+3	Str3	25	example2.csv	test/03036_archive3.tar.gz::example2.csv
+4	Str4	25	example2.csv	test/03036_archive3.tar.gz::example2.csv
+5	Str5	25	example3.csv	test/03036_archive2.tar::example3.csv
+6	Str6	25	example3.csv	test/03036_archive2.tar::example3.csv
+3	Str3	25	example2.csv	test/03036_archive2.zip::example2.csv
+4	Str4	25	example2.csv	test/03036_archive2.zip::example2.csv
+5	Str5	25	example3.csv	test/03036_archive2.tar::example3.csv
+6	Str6	25	example3.csv	test/03036_archive2.tar::example3.csv
+7	Str7	25	example4.csv	test/03036_archive2.tar::example4.csv
+8	Str8	25	example4.csv	test/03036_archive2.tar::example4.csv
+9	Str9	27	example5.csv	test/03036_archive2.tar::example5.csv
+10	Str10	27	example5.csv	test/03036_archive2.tar::example5.csv
+3	Str3	25	example2.csv	test/03036_archive3.tar.gz::example2.csv
+4	Str4	25	example2.csv	test/03036_archive3.tar.gz::example2.csv
+5	Str5	25	example3.csv	test/03036_archive2.tar::example3.csv
+6	Str6	25	example3.csv	test/03036_archive2.tar::example3.csv
+13	Str13	57	example7.csv	test/03036_compressed_file_archive.zip::example7.csv
+14	Str14	57	example7.csv	test/03036_compressed_file_archive.zip::example7.csv
--- a/tests/queries/0_stateless/03036_reading_s3_archives.sql
+++ b/tests/queries/0_stateless/03036_reading_s3_archives.sql
@ -1,22 +1,22 @@
 -- Tags: no-fasttest
 -- Tag no-fasttest: Depends on AWS

-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive1.zip :: example1.csv') ORDER BY (id, _file, _path);
-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive2.zip :: example*.csv') ORDER BY (id, _file, _path);
-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example2.csv') ORDER BY (id, _file, _path);
-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example*') ORDER BY (id, _file, _path);
-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive1.tar :: example1.csv') ORDER BY (id, _file, _path);
-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar :: example4.csv') ORDER BY (id, _file, _path);
-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive2.tar :: example*.csv') ORDER BY (id, _file, _path);
-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar.gz :: example*.csv') ORDER BY (id, _file, _path);
-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv') ORDER BY (id, _file, _path);
-select id, data, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
-select id, data, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent{2..3}.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive1.zip :: example1.csv') ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive2.zip :: example*.csv') ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example2.csv') ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example*') ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive1.tar :: example1.csv') ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar :: example4.csv') ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive2.tar :: example*.csv') ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar.gz :: example*.csv') ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv') ORDER BY (id, _file, _path);
+select id, data, _size, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
+select id, data, _size, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent{2..3}.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
 CREATE TABLE table_zip22 Engine S3(s3_conn, filename='03036_archive2.zip :: example2.csv');
-select id, data, _file, _path from table_zip22 ORDER BY (id, _file, _path);
+select id, data, _size, _file, _path from table_zip22 ORDER BY (id, _file, _path);
 CREATE table table_tar2star Engine S3(s3_conn, filename='03036_archive2.tar :: example*.csv');
-SELECT id, data, _file, _path FROM table_tar2star ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM table_tar2star ORDER BY (id, _file, _path);
 CREATE table table_tarstarglobs Engine S3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv');
-SELECT id, data, _file, _path FROM table_tarstarglobs ORDER BY (id, _file, _path);
+SELECT id, data, _size, _file, _path FROM table_tarstarglobs ORDER BY (id, _file, _path);
 CREATE table table_noexist Engine s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError UNKNOWN_STORAGE }
-SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_compressed_file_archive.zip :: example7.csv', format='CSV', structure='auto', compression_method='gz') ORDER BY (id, _file, _path)
+SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_compressed_file_archive.zip :: example7.csv', format='CSV', structure='auto', compression_method='gz') ORDER BY (id, _file, _path)