Merge pull request #65993 from ClickHouse/divanik/fix_size_column_in_s3_archives

Fix bug with _size of archives in s3 table function
This commit is contained in:
Daniil Ivanik 2024-07-03 13:38:01 +00:00 committed by GitHub
commit 4ab2d514e6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 99 additions and 90 deletions

View File

@ -269,9 +269,9 @@ FROM s3(
## Virtual Columns {#virtual-columns}
- `_path` — Path to the file. Type: `LowCardinality(String)`.
- `_file` — Name of the file. Type: `LowCardinality(String)`.
- `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`.
- `_path` — Path to the file. Type: `LowCardinality(String)`. For a file inside an archive, the path has the format `{path_to_archive}::{path_to_file_inside_archive}`.
- `_file` — Name of the file. Type: `LowCardinality(String)`. For a file inside an archive, this is the name of the file inside the archive.
- `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. For a file inside an archive, this is the uncompressed size of the file inside the archive.
- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`.
## Storage Settings {#storage-settings}

View File

@ -75,6 +75,7 @@ struct RelativePathWithMetadata
/// Returns the stored `relative_path`. Virtual so that derived object descriptions can report a different path.
virtual std::string getPath() const { return relative_path; }
/// Base implementation always returns false; derived types that describe a file inside an archive override it to return true.
virtual bool isArchive() const { return false; }
/// Only meaningful for archive entries: the base implementation unconditionally throws LOGICAL_ERROR ("Not an archive").
/// Callers are expected to check isArchive() first.
virtual std::string getPathToArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); }
/// Size of the file inside the archive. The base implementation unconditionally throws LOGICAL_ERROR ("Not an archive"),
/// so it must only be called when isArchive() is true.
virtual size_t fileSizeInArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); }
};
struct ObjectKeyWithMetadata

View File

@ -196,13 +196,12 @@ Chunk StorageObjectStorageSource::generate()
const auto & filename = object_info->getFileName();
chassert(object_info->metadata);
VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk(
chunk, read_from_format_info.requested_virtual_columns,
{
.path = getUniqueStoragePathIdentifier(*configuration, *object_info, false),
.size = object_info->metadata->size_bytes,
.filename = &filename,
.last_modified = object_info->metadata->last_modified
});
chunk,
read_from_format_info.requested_virtual_columns,
{.path = getUniqueStoragePathIdentifier(*configuration, *object_info, false),
.size = object_info->isArchive() ? object_info->fileSizeInArchive() : object_info->metadata->size_bytes,
.filename = &filename,
.last_modified = object_info->metadata->last_modified});
const auto & partition_columns = configuration->getPartitionColumns();
if (!partition_columns.empty() && chunk_size && chunk.hasColumns())
@ -227,7 +226,6 @@ Chunk StorageObjectStorageSource::generate()
chunk.addColumn(std::move(partition_column));
}
}
return chunk;
}
@ -715,10 +713,9 @@ static IArchiveReader::NameFilter createArchivePathFilter(const std::string & ar
/// Describes a single file located inside an archive object.
///
/// @param archive_object_   Object info of the archive itself.
/// @param path_in_archive_  Path of the file inside the archive.
/// @param archive_reader_   Reader opened on the archive; shared with the created object.
/// @param file_info_        Metadata of the file inside the archive (its `uncompressed_size` is what
///                          fileSizeInArchive() reports). Taken by rvalue reference and moved into the member.
///
/// Note: a named rvalue-reference parameter is an lvalue, so it has to be passed through std::move
/// explicitly; without it the member would be copy-constructed and the caller's std::move would be wasted.
/// The by-value shared_ptr parameters are moved as well to avoid extra reference-count updates.
StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive::ObjectInfoInArchive(
    ObjectInfoPtr archive_object_,
    const std::string & path_in_archive_,
    std::shared_ptr<IArchiveReader> archive_reader_,
    IArchiveReader::FileInfo && file_info_)
    : archive_object(std::move(archive_object_))
    , path_in_archive(path_in_archive_)
    , archive_reader(std::move(archive_reader_))
    , file_info(std::move(file_info_))
{
}
@ -757,6 +754,7 @@ StorageObjectStorageSource::ObjectInfoPtr
StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor)
{
std::unique_lock lock{next_mutex};
IArchiveReader::FileInfo current_file_info{};
while (true)
{
if (filter)
@ -781,6 +779,8 @@ StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor)
path_in_archive = file_enumerator->getFileName();
if (!filter(path_in_archive))
continue;
else
current_file_info = file_enumerator->getFileInfo();
}
else
{
@ -794,15 +794,19 @@ StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor)
archive_reader = createArchiveReader(archive_object);
if (!archive_reader->fileExists(path_in_archive))
continue;
else
current_file_info = archive_reader->getFileInfo(path_in_archive);
}
auto object_in_archive = std::make_shared<ObjectInfoInArchive>(archive_object, path_in_archive, archive_reader);
if (read_keys != nullptr)
read_keys->push_back(object_in_archive);
return object_in_archive;
break;
}
auto object_in_archive
= std::make_shared<ObjectInfoInArchive>(archive_object, path_in_archive, archive_reader, std::move(current_file_info));
if (read_keys != nullptr)
read_keys->push_back(object_in_archive);
return object_in_archive;
}
size_t StorageObjectStorageSource::ArchiveIterator::estimatedKeysCount()

View File

@ -260,7 +260,8 @@ public:
ObjectInfoInArchive(
ObjectInfoPtr archive_object_,
const std::string & path_in_archive_,
std::shared_ptr<IArchiveReader> archive_reader_);
std::shared_ptr<IArchiveReader> archive_reader_,
IArchiveReader::FileInfo && file_info_);
std::string getFileName() const override
{
@ -279,9 +280,12 @@ public:
/// Always true: this object describes a file inside an archive.
bool isArchive() const override { return true; }
/// Returns the uncompressed size recorded in `file_info`.
size_t fileSizeInArchive() const override { return file_info.uncompressed_size; }
const ObjectInfoPtr archive_object;
const std::string path_in_archive;
const std::shared_ptr<IArchiveReader> archive_reader;
const IArchiveReader::FileInfo file_info;
};
private:

View File

@ -1,52 +1,52 @@
1 Str1 example1.csv test/03036_archive1.zip::example1.csv
2 Str2 example1.csv test/03036_archive1.zip::example1.csv
3 Str3 example2.csv test/03036_archive2.zip::example2.csv
4 Str4 example2.csv test/03036_archive2.zip::example2.csv
5 Str5 example3.csv test/03036_archive2.zip::example3.csv
6 Str6 example3.csv test/03036_archive2.zip::example3.csv
3 Str3 example2.csv test/03036_archive1.zip::example2.csv
3 Str3 example2.csv test/03036_archive2.zip::example2.csv
4 Str4 example2.csv test/03036_archive1.zip::example2.csv
4 Str4 example2.csv test/03036_archive2.zip::example2.csv
1 Str1 example1.csv test/03036_archive1.zip::example1.csv
2 Str2 example1.csv test/03036_archive1.zip::example1.csv
3 Str3 example2.csv test/03036_archive1.zip::example2.csv
3 Str3 example2.csv test/03036_archive2.zip::example2.csv
4 Str4 example2.csv test/03036_archive1.zip::example2.csv
4 Str4 example2.csv test/03036_archive2.zip::example2.csv
5 Str5 example3.csv test/03036_archive2.zip::example3.csv
6 Str6 example3.csv test/03036_archive2.zip::example3.csv
1 Str1 example1.csv test/03036_archive1.tar::example1.csv
2 Str2 example1.csv test/03036_archive1.tar::example1.csv
7 Str7 example4.csv test/03036_archive1.tar::example4.csv
7 Str7 example4.csv test/03036_archive2.tar::example4.csv
8 Str8 example4.csv test/03036_archive1.tar::example4.csv
8 Str8 example4.csv test/03036_archive2.tar::example4.csv
5 Str5 example3.csv test/03036_archive2.tar::example3.csv
6 Str6 example3.csv test/03036_archive2.tar::example3.csv
7 Str7 example4.csv test/03036_archive2.tar::example4.csv
8 Str8 example4.csv test/03036_archive2.tar::example4.csv
9 Str9 example5.csv test/03036_archive2.tar::example5.csv
10 Str10 example5.csv test/03036_archive2.tar::example5.csv
3 Str3 example2.csv test/03036_archive3.tar.gz::example2.csv
4 Str4 example2.csv test/03036_archive3.tar.gz::example2.csv
11 Str11 example6.csv test/03036_archive3.tar.gz::example6.csv
12 Str12 example6.csv test/03036_archive3.tar.gz::example6.csv
3 Str3 example2.csv test/03036_archive3.tar.gz::example2.csv
4 Str4 example2.csv test/03036_archive3.tar.gz::example2.csv
5 Str5 example3.csv test/03036_archive2.tar::example3.csv
6 Str6 example3.csv test/03036_archive2.tar::example3.csv
3 Str3 example2.csv test/03036_archive2.zip::example2.csv
4 Str4 example2.csv test/03036_archive2.zip::example2.csv
5 Str5 example3.csv test/03036_archive2.tar::example3.csv
6 Str6 example3.csv test/03036_archive2.tar::example3.csv
7 Str7 example4.csv test/03036_archive2.tar::example4.csv
8 Str8 example4.csv test/03036_archive2.tar::example4.csv
9 Str9 example5.csv test/03036_archive2.tar::example5.csv
10 Str10 example5.csv test/03036_archive2.tar::example5.csv
3 Str3 example2.csv test/03036_archive3.tar.gz::example2.csv
4 Str4 example2.csv test/03036_archive3.tar.gz::example2.csv
5 Str5 example3.csv test/03036_archive2.tar::example3.csv
6 Str6 example3.csv test/03036_archive2.tar::example3.csv
13 Str13 example7.csv test/03036_compressed_file_archive.zip::example7.csv
14 Str14 example7.csv test/03036_compressed_file_archive.zip::example7.csv
1 Str1 25 example1.csv test/03036_archive1.zip::example1.csv
2 Str2 25 example1.csv test/03036_archive1.zip::example1.csv
3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv
4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv
5 Str5 25 example3.csv test/03036_archive2.zip::example3.csv
6 Str6 25 example3.csv test/03036_archive2.zip::example3.csv
3 Str3 25 example2.csv test/03036_archive1.zip::example2.csv
3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv
4 Str4 25 example2.csv test/03036_archive1.zip::example2.csv
4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv
1 Str1 25 example1.csv test/03036_archive1.zip::example1.csv
2 Str2 25 example1.csv test/03036_archive1.zip::example1.csv
3 Str3 25 example2.csv test/03036_archive1.zip::example2.csv
3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv
4 Str4 25 example2.csv test/03036_archive1.zip::example2.csv
4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv
5 Str5 25 example3.csv test/03036_archive2.zip::example3.csv
6 Str6 25 example3.csv test/03036_archive2.zip::example3.csv
1 Str1 25 example1.csv test/03036_archive1.tar::example1.csv
2 Str2 25 example1.csv test/03036_archive1.tar::example1.csv
7 Str7 25 example4.csv test/03036_archive1.tar::example4.csv
7 Str7 25 example4.csv test/03036_archive2.tar::example4.csv
8 Str8 25 example4.csv test/03036_archive1.tar::example4.csv
8 Str8 25 example4.csv test/03036_archive2.tar::example4.csv
5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv
6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv
7 Str7 25 example4.csv test/03036_archive2.tar::example4.csv
8 Str8 25 example4.csv test/03036_archive2.tar::example4.csv
9 Str9 27 example5.csv test/03036_archive2.tar::example5.csv
10 Str10 27 example5.csv test/03036_archive2.tar::example5.csv
3 Str3 25 example2.csv test/03036_archive3.tar.gz::example2.csv
4 Str4 25 example2.csv test/03036_archive3.tar.gz::example2.csv
11 Str11 29 example6.csv test/03036_archive3.tar.gz::example6.csv
12 Str12 29 example6.csv test/03036_archive3.tar.gz::example6.csv
3 Str3 25 example2.csv test/03036_archive3.tar.gz::example2.csv
4 Str4 25 example2.csv test/03036_archive3.tar.gz::example2.csv
5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv
6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv
3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv
4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv
5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv
6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv
7 Str7 25 example4.csv test/03036_archive2.tar::example4.csv
8 Str8 25 example4.csv test/03036_archive2.tar::example4.csv
9 Str9 27 example5.csv test/03036_archive2.tar::example5.csv
10 Str10 27 example5.csv test/03036_archive2.tar::example5.csv
3 Str3 25 example2.csv test/03036_archive3.tar.gz::example2.csv
4 Str4 25 example2.csv test/03036_archive3.tar.gz::example2.csv
5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv
6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv
13 Str13 57 example7.csv test/03036_compressed_file_archive.zip::example7.csv
14 Str14 57 example7.csv test/03036_compressed_file_archive.zip::example7.csv

View File

@ -1,22 +1,22 @@
-- Tags: no-fasttest
-- Tag no-fasttest: Depends on AWS
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive1.zip :: example1.csv') ORDER BY (id, _file, _path);
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive2.zip :: example*.csv') ORDER BY (id, _file, _path);
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example2.csv') ORDER BY (id, _file, _path);
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example*') ORDER BY (id, _file, _path);
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive1.tar :: example1.csv') ORDER BY (id, _file, _path);
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar :: example4.csv') ORDER BY (id, _file, _path);
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive2.tar :: example*.csv') ORDER BY (id, _file, _path);
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar.gz :: example*.csv') ORDER BY (id, _file, _path);
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv') ORDER BY (id, _file, _path);
select id, data, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
select id, data, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent{2..3}.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive1.zip :: example1.csv') ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive2.zip :: example*.csv') ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example2.csv') ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example*') ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive1.tar :: example1.csv') ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar :: example4.csv') ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive2.tar :: example*.csv') ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar.gz :: example*.csv') ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv') ORDER BY (id, _file, _path);
select id, data, _size, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
select id, data, _size, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent{2..3}.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
CREATE TABLE table_zip22 Engine S3(s3_conn, filename='03036_archive2.zip :: example2.csv');
select id, data, _file, _path from table_zip22 ORDER BY (id, _file, _path);
select id, data, _size, _file, _path from table_zip22 ORDER BY (id, _file, _path);
CREATE table table_tar2star Engine S3(s3_conn, filename='03036_archive2.tar :: example*.csv');
SELECT id, data, _file, _path FROM table_tar2star ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM table_tar2star ORDER BY (id, _file, _path);
CREATE table table_tarstarglobs Engine S3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv');
SELECT id, data, _file, _path FROM table_tarstarglobs ORDER BY (id, _file, _path);
SELECT id, data, _size, _file, _path FROM table_tarstarglobs ORDER BY (id, _file, _path);
CREATE table table_noexist Engine s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError UNKNOWN_STORAGE }
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_compressed_file_archive.zip :: example7.csv', format='CSV', structure='auto', compression_method='gz') ORDER BY (id, _file, _path)
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_compressed_file_archive.zip :: example7.csv', format='CSV', structure='auto', compression_method='gz') ORDER BY (id, _file, _path)