mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 16:42:05 +00:00
Merge pull request #65993 from ClickHouse/divanik/fix_size_column_in_s3_archives
Fix bug with _size of archives in s3 table function
This commit is contained in:
commit
4ab2d514e6
@ -269,9 +269,9 @@ FROM s3(
|
||||
|
||||
## Virtual Columns {#virtual-columns}
|
||||
|
||||
- `_path` — Path to the file. Type: `LowCardinality(String)`.
|
||||
- `_file` — Name of the file. Type: `LowCardinality(String)`.
|
||||
- `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`.
|
||||
- `_path` — Path to the file. Type: `LowCardinality(String)`. In case of an archive, shows the path in the format: "{path_to_archive}::{path_to_file_inside_archive}".
|
||||
- `_file` — Name of the file. Type: `LowCardinality(String)`. In case of an archive, shows the name of the file inside the archive.
|
||||
- `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. In case of an archive, shows the uncompressed size of the file inside the archive.
|
||||
- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`.
|
||||
|
||||
## Storage Settings {#storage-settings}
|
||||
|
@ -75,6 +75,7 @@ struct RelativePathWithMetadata
|
||||
virtual std::string getPath() const { return relative_path; }
|
||||
virtual bool isArchive() const { return false; }
|
||||
virtual std::string getPathToArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); }
|
||||
virtual size_t fileSizeInArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); }
|
||||
};
|
||||
|
||||
struct ObjectKeyWithMetadata
|
||||
|
@ -196,13 +196,12 @@ Chunk StorageObjectStorageSource::generate()
|
||||
const auto & filename = object_info->getFileName();
|
||||
chassert(object_info->metadata);
|
||||
VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk(
|
||||
chunk, read_from_format_info.requested_virtual_columns,
|
||||
{
|
||||
.path = getUniqueStoragePathIdentifier(*configuration, *object_info, false),
|
||||
.size = object_info->metadata->size_bytes,
|
||||
.filename = &filename,
|
||||
.last_modified = object_info->metadata->last_modified
|
||||
});
|
||||
chunk,
|
||||
read_from_format_info.requested_virtual_columns,
|
||||
{.path = getUniqueStoragePathIdentifier(*configuration, *object_info, false),
|
||||
.size = object_info->isArchive() ? object_info->fileSizeInArchive() : object_info->metadata->size_bytes,
|
||||
.filename = &filename,
|
||||
.last_modified = object_info->metadata->last_modified});
|
||||
|
||||
const auto & partition_columns = configuration->getPartitionColumns();
|
||||
if (!partition_columns.empty() && chunk_size && chunk.hasColumns())
|
||||
@ -227,7 +226,6 @@ Chunk StorageObjectStorageSource::generate()
|
||||
chunk.addColumn(std::move(partition_column));
|
||||
}
|
||||
}
|
||||
|
||||
return chunk;
|
||||
}
|
||||
|
||||
@ -715,10 +713,9 @@ static IArchiveReader::NameFilter createArchivePathFilter(const std::string & ar
|
||||
StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive::ObjectInfoInArchive(
|
||||
ObjectInfoPtr archive_object_,
|
||||
const std::string & path_in_archive_,
|
||||
std::shared_ptr<IArchiveReader> archive_reader_)
|
||||
: archive_object(archive_object_)
|
||||
, path_in_archive(path_in_archive_)
|
||||
, archive_reader(archive_reader_)
|
||||
std::shared_ptr<IArchiveReader> archive_reader_,
|
||||
IArchiveReader::FileInfo && file_info_)
|
||||
: archive_object(archive_object_), path_in_archive(path_in_archive_), archive_reader(archive_reader_), file_info(file_info_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -757,6 +754,7 @@ StorageObjectStorageSource::ObjectInfoPtr
|
||||
StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor)
|
||||
{
|
||||
std::unique_lock lock{next_mutex};
|
||||
IArchiveReader::FileInfo current_file_info{};
|
||||
while (true)
|
||||
{
|
||||
if (filter)
|
||||
@ -781,6 +779,8 @@ StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor)
|
||||
path_in_archive = file_enumerator->getFileName();
|
||||
if (!filter(path_in_archive))
|
||||
continue;
|
||||
else
|
||||
current_file_info = file_enumerator->getFileInfo();
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -794,15 +794,19 @@ StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor)
|
||||
archive_reader = createArchiveReader(archive_object);
|
||||
if (!archive_reader->fileExists(path_in_archive))
|
||||
continue;
|
||||
else
|
||||
current_file_info = archive_reader->getFileInfo(path_in_archive);
|
||||
}
|
||||
|
||||
auto object_in_archive = std::make_shared<ObjectInfoInArchive>(archive_object, path_in_archive, archive_reader);
|
||||
|
||||
if (read_keys != nullptr)
|
||||
read_keys->push_back(object_in_archive);
|
||||
|
||||
return object_in_archive;
|
||||
break;
|
||||
}
|
||||
|
||||
auto object_in_archive
|
||||
= std::make_shared<ObjectInfoInArchive>(archive_object, path_in_archive, archive_reader, std::move(current_file_info));
|
||||
|
||||
if (read_keys != nullptr)
|
||||
read_keys->push_back(object_in_archive);
|
||||
|
||||
return object_in_archive;
|
||||
}
|
||||
|
||||
size_t StorageObjectStorageSource::ArchiveIterator::estimatedKeysCount()
|
||||
|
@ -260,7 +260,8 @@ public:
|
||||
ObjectInfoInArchive(
|
||||
ObjectInfoPtr archive_object_,
|
||||
const std::string & path_in_archive_,
|
||||
std::shared_ptr<IArchiveReader> archive_reader_);
|
||||
std::shared_ptr<IArchiveReader> archive_reader_,
|
||||
IArchiveReader::FileInfo && file_info_);
|
||||
|
||||
std::string getFileName() const override
|
||||
{
|
||||
@ -279,9 +280,12 @@ public:
|
||||
|
||||
bool isArchive() const override { return true; }
|
||||
|
||||
size_t fileSizeInArchive() const override { return file_info.uncompressed_size; }
|
||||
|
||||
const ObjectInfoPtr archive_object;
|
||||
const std::string path_in_archive;
|
||||
const std::shared_ptr<IArchiveReader> archive_reader;
|
||||
const IArchiveReader::FileInfo file_info;
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -1,52 +1,52 @@
|
||||
1 Str1 example1.csv test/03036_archive1.zip::example1.csv
|
||||
2 Str2 example1.csv test/03036_archive1.zip::example1.csv
|
||||
3 Str3 example2.csv test/03036_archive2.zip::example2.csv
|
||||
4 Str4 example2.csv test/03036_archive2.zip::example2.csv
|
||||
5 Str5 example3.csv test/03036_archive2.zip::example3.csv
|
||||
6 Str6 example3.csv test/03036_archive2.zip::example3.csv
|
||||
3 Str3 example2.csv test/03036_archive1.zip::example2.csv
|
||||
3 Str3 example2.csv test/03036_archive2.zip::example2.csv
|
||||
4 Str4 example2.csv test/03036_archive1.zip::example2.csv
|
||||
4 Str4 example2.csv test/03036_archive2.zip::example2.csv
|
||||
1 Str1 example1.csv test/03036_archive1.zip::example1.csv
|
||||
2 Str2 example1.csv test/03036_archive1.zip::example1.csv
|
||||
3 Str3 example2.csv test/03036_archive1.zip::example2.csv
|
||||
3 Str3 example2.csv test/03036_archive2.zip::example2.csv
|
||||
4 Str4 example2.csv test/03036_archive1.zip::example2.csv
|
||||
4 Str4 example2.csv test/03036_archive2.zip::example2.csv
|
||||
5 Str5 example3.csv test/03036_archive2.zip::example3.csv
|
||||
6 Str6 example3.csv test/03036_archive2.zip::example3.csv
|
||||
1 Str1 example1.csv test/03036_archive1.tar::example1.csv
|
||||
2 Str2 example1.csv test/03036_archive1.tar::example1.csv
|
||||
7 Str7 example4.csv test/03036_archive1.tar::example4.csv
|
||||
7 Str7 example4.csv test/03036_archive2.tar::example4.csv
|
||||
8 Str8 example4.csv test/03036_archive1.tar::example4.csv
|
||||
8 Str8 example4.csv test/03036_archive2.tar::example4.csv
|
||||
5 Str5 example3.csv test/03036_archive2.tar::example3.csv
|
||||
6 Str6 example3.csv test/03036_archive2.tar::example3.csv
|
||||
7 Str7 example4.csv test/03036_archive2.tar::example4.csv
|
||||
8 Str8 example4.csv test/03036_archive2.tar::example4.csv
|
||||
9 Str9 example5.csv test/03036_archive2.tar::example5.csv
|
||||
10 Str10 example5.csv test/03036_archive2.tar::example5.csv
|
||||
3 Str3 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
4 Str4 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
11 Str11 example6.csv test/03036_archive3.tar.gz::example6.csv
|
||||
12 Str12 example6.csv test/03036_archive3.tar.gz::example6.csv
|
||||
3 Str3 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
4 Str4 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
5 Str5 example3.csv test/03036_archive2.tar::example3.csv
|
||||
6 Str6 example3.csv test/03036_archive2.tar::example3.csv
|
||||
3 Str3 example2.csv test/03036_archive2.zip::example2.csv
|
||||
4 Str4 example2.csv test/03036_archive2.zip::example2.csv
|
||||
5 Str5 example3.csv test/03036_archive2.tar::example3.csv
|
||||
6 Str6 example3.csv test/03036_archive2.tar::example3.csv
|
||||
7 Str7 example4.csv test/03036_archive2.tar::example4.csv
|
||||
8 Str8 example4.csv test/03036_archive2.tar::example4.csv
|
||||
9 Str9 example5.csv test/03036_archive2.tar::example5.csv
|
||||
10 Str10 example5.csv test/03036_archive2.tar::example5.csv
|
||||
3 Str3 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
4 Str4 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
5 Str5 example3.csv test/03036_archive2.tar::example3.csv
|
||||
6 Str6 example3.csv test/03036_archive2.tar::example3.csv
|
||||
13 Str13 example7.csv test/03036_compressed_file_archive.zip::example7.csv
|
||||
14 Str14 example7.csv test/03036_compressed_file_archive.zip::example7.csv
|
||||
1 Str1 25 example1.csv test/03036_archive1.zip::example1.csv
|
||||
2 Str2 25 example1.csv test/03036_archive1.zip::example1.csv
|
||||
3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv
|
||||
4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv
|
||||
5 Str5 25 example3.csv test/03036_archive2.zip::example3.csv
|
||||
6 Str6 25 example3.csv test/03036_archive2.zip::example3.csv
|
||||
3 Str3 25 example2.csv test/03036_archive1.zip::example2.csv
|
||||
3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv
|
||||
4 Str4 25 example2.csv test/03036_archive1.zip::example2.csv
|
||||
4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv
|
||||
1 Str1 25 example1.csv test/03036_archive1.zip::example1.csv
|
||||
2 Str2 25 example1.csv test/03036_archive1.zip::example1.csv
|
||||
3 Str3 25 example2.csv test/03036_archive1.zip::example2.csv
|
||||
3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv
|
||||
4 Str4 25 example2.csv test/03036_archive1.zip::example2.csv
|
||||
4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv
|
||||
5 Str5 25 example3.csv test/03036_archive2.zip::example3.csv
|
||||
6 Str6 25 example3.csv test/03036_archive2.zip::example3.csv
|
||||
1 Str1 25 example1.csv test/03036_archive1.tar::example1.csv
|
||||
2 Str2 25 example1.csv test/03036_archive1.tar::example1.csv
|
||||
7 Str7 25 example4.csv test/03036_archive1.tar::example4.csv
|
||||
7 Str7 25 example4.csv test/03036_archive2.tar::example4.csv
|
||||
8 Str8 25 example4.csv test/03036_archive1.tar::example4.csv
|
||||
8 Str8 25 example4.csv test/03036_archive2.tar::example4.csv
|
||||
5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv
|
||||
6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv
|
||||
7 Str7 25 example4.csv test/03036_archive2.tar::example4.csv
|
||||
8 Str8 25 example4.csv test/03036_archive2.tar::example4.csv
|
||||
9 Str9 27 example5.csv test/03036_archive2.tar::example5.csv
|
||||
10 Str10 27 example5.csv test/03036_archive2.tar::example5.csv
|
||||
3 Str3 25 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
4 Str4 25 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
11 Str11 29 example6.csv test/03036_archive3.tar.gz::example6.csv
|
||||
12 Str12 29 example6.csv test/03036_archive3.tar.gz::example6.csv
|
||||
3 Str3 25 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
4 Str4 25 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv
|
||||
6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv
|
||||
3 Str3 25 example2.csv test/03036_archive2.zip::example2.csv
|
||||
4 Str4 25 example2.csv test/03036_archive2.zip::example2.csv
|
||||
5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv
|
||||
6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv
|
||||
7 Str7 25 example4.csv test/03036_archive2.tar::example4.csv
|
||||
8 Str8 25 example4.csv test/03036_archive2.tar::example4.csv
|
||||
9 Str9 27 example5.csv test/03036_archive2.tar::example5.csv
|
||||
10 Str10 27 example5.csv test/03036_archive2.tar::example5.csv
|
||||
3 Str3 25 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
4 Str4 25 example2.csv test/03036_archive3.tar.gz::example2.csv
|
||||
5 Str5 25 example3.csv test/03036_archive2.tar::example3.csv
|
||||
6 Str6 25 example3.csv test/03036_archive2.tar::example3.csv
|
||||
13 Str13 57 example7.csv test/03036_compressed_file_archive.zip::example7.csv
|
||||
14 Str14 57 example7.csv test/03036_compressed_file_archive.zip::example7.csv
|
||||
|
@ -1,22 +1,22 @@
|
||||
-- Tags: no-fasttest
|
||||
-- Tag no-fasttest: Depends on AWS
|
||||
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive1.zip :: example1.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive2.zip :: example*.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example2.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example*') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive1.tar :: example1.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar :: example4.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive2.tar :: example*.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar.gz :: example*.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv') ORDER BY (id, _file, _path);
|
||||
select id, data, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
|
||||
select id, data, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent{2..3}.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive1.zip :: example1.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive2.zip :: example*.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example2.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.zip :: example*') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive1.tar :: example1.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar :: example4.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive2.tar :: example*.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar.gz :: example*.csv') ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv') ORDER BY (id, _file, _path);
|
||||
select id, data, _size, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
|
||||
select id, data, _size, _file, _path from s3(s3_conn, filename='03036_archive2.zip :: nonexistent{2..3}.csv'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
|
||||
CREATE TABLE table_zip22 Engine S3(s3_conn, filename='03036_archive2.zip :: example2.csv');
|
||||
select id, data, _file, _path from table_zip22 ORDER BY (id, _file, _path);
|
||||
select id, data, _size, _file, _path from table_zip22 ORDER BY (id, _file, _path);
|
||||
CREATE table table_tar2star Engine S3(s3_conn, filename='03036_archive2.tar :: example*.csv');
|
||||
SELECT id, data, _file, _path FROM table_tar2star ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM table_tar2star ORDER BY (id, _file, _path);
|
||||
CREATE table table_tarstarglobs Engine S3(s3_conn, filename='03036_archive*.tar* :: example{2..3}.csv');
|
||||
SELECT id, data, _file, _path FROM table_tarstarglobs ORDER BY (id, _file, _path);
|
||||
SELECT id, data, _size, _file, _path FROM table_tarstarglobs ORDER BY (id, _file, _path);
|
||||
CREATE table table_noexist Engine s3(s3_conn, filename='03036_archive2.zip :: nonexistent.csv'); -- { serverError UNKNOWN_STORAGE }
|
||||
SELECT id, data, _file, _path FROM s3(s3_conn, filename='03036_compressed_file_archive.zip :: example7.csv', format='CSV', structure='auto', compression_method='gz') ORDER BY (id, _file, _path)
|
||||
SELECT id, data, _size, _file, _path FROM s3(s3_conn, filename='03036_compressed_file_archive.zip :: example7.csv', format='CSV', structure='auto', compression_method='gz') ORDER BY (id, _file, _path)
|
||||
|
Loading…
Reference in New Issue
Block a user