mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-29 19:12:03 +00:00
Add support for file globs
This commit is contained in:
parent
2430ebabf5
commit
9423976b7a
@ -45,16 +45,20 @@ public:
|
||||
/// Starts enumerating files in the archive.
|
||||
virtual std::unique_ptr<FileEnumerator> firstFile() = 0;
|
||||
|
||||
using NameFilter = std::function<bool(const std::string &)>;
|
||||
|
||||
/// Starts reading a file from the archive. The function returns a read buffer,
|
||||
/// you can read that buffer to extract uncompressed data from the archive.
|
||||
/// Several read buffers can be used at the same time in parallel.
|
||||
virtual std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) = 0;
|
||||
virtual std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter) = 0;
|
||||
|
||||
/// It's possible to convert a file enumerator to a read buffer and vice versa.
|
||||
virtual std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) = 0;
|
||||
virtual std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) = 0;
|
||||
|
||||
virtual std::vector<std::string> getAllFiles() = 0;
|
||||
virtual std::vector<std::string> getAllFiles(NameFilter filter) = 0;
|
||||
|
||||
/// Sets password used to decrypt files in the archive.
|
||||
virtual void setPassword(const String & /* password */) {}
|
||||
|
@ -49,7 +49,12 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
bool locateFile(const String & filename)
|
||||
bool locateFile(const std::string & filename)
|
||||
{
|
||||
return locateFile([&](const std::string & file) { return file == filename; });
|
||||
}
|
||||
|
||||
bool locateFile(NameFilter filter)
|
||||
{
|
||||
resetFileInfo();
|
||||
int err = ARCHIVE_OK;
|
||||
@ -63,7 +68,7 @@ public:
|
||||
if (err != ARCHIVE_OK)
|
||||
break;
|
||||
|
||||
if (archive_entry_pathname(current_entry) == filename)
|
||||
if (filter(archive_entry_pathname(current_entry)))
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -95,7 +100,7 @@ public:
|
||||
return archive;
|
||||
}
|
||||
|
||||
std::vector<std::string> getAllFiles()
|
||||
std::vector<std::string> getAllFiles(NameFilter filter)
|
||||
{
|
||||
auto * archive = open(path_to_archive);
|
||||
auto * entry = archive_entry_new();
|
||||
@ -104,7 +109,10 @@ public:
|
||||
int error = archive_read_next_header(archive, &entry);
|
||||
while (error == ARCHIVE_OK || error == ARCHIVE_RETRY)
|
||||
{
|
||||
files.push_back(archive_entry_pathname(entry));
|
||||
std::string name = archive_entry_pathname(entry);
|
||||
if (!filter || filter(name))
|
||||
files.push_back(std::move(name));
|
||||
|
||||
error = archive_read_next_header(archive, &entry);
|
||||
}
|
||||
|
||||
@ -262,9 +270,15 @@ std::unique_ptr<typename LibArchiveReader<ArchiveInfo>::FileEnumerator> LibArchi
|
||||
|
||||
template <typename ArchiveInfo>
|
||||
std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader<ArchiveInfo>::readFile(const String & filename)
|
||||
{
|
||||
return readFile([&](const std::string & file) { return file == filename; });
|
||||
}
|
||||
|
||||
template <typename ArchiveInfo>
|
||||
std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader<ArchiveInfo>::readFile(NameFilter filter)
|
||||
{
|
||||
Handle handle(path_to_archive);
|
||||
handle.locateFile(filename);
|
||||
handle.locateFile(filter);
|
||||
return std::make_unique<ReadBufferFromLibArchive>(std::move(handle), path_to_archive);
|
||||
}
|
||||
|
||||
@ -292,9 +306,15 @@ LibArchiveReader<ArchiveInfo>::nextFile(std::unique_ptr<ReadBuffer> read_buffer)
|
||||
|
||||
template <typename ArchiveInfo>
|
||||
std::vector<std::string> LibArchiveReader<ArchiveInfo>::getAllFiles()
|
||||
{
|
||||
return getAllFiles({});
|
||||
}
|
||||
|
||||
template <typename ArchiveInfo>
|
||||
std::vector<std::string> LibArchiveReader<ArchiveInfo>::getAllFiles(NameFilter filter)
|
||||
{
|
||||
Handle handle(path_to_archive);
|
||||
return handle.getAllFiles();
|
||||
return handle.getAllFiles(filter);
|
||||
}
|
||||
|
||||
template <typename ArchiveInfo>
|
||||
|
@ -40,12 +40,14 @@ public:
|
||||
/// you can read that buffer to extract uncompressed data from the archive.
|
||||
/// Several read buffers can be used at the same time in parallel.
|
||||
std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) override;
|
||||
std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter) override;
|
||||
|
||||
/// It's possible to convert a file enumerator to a read buffer and vice versa.
|
||||
std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;
|
||||
std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) override;
|
||||
|
||||
std::vector<std::string> getAllFiles() override;
|
||||
std::vector<std::string> getAllFiles(NameFilter filter) override;
|
||||
|
||||
/// Sets password used to decrypt the contents of the files in the archive.
|
||||
void setPassword(const String & password_) override;
|
||||
|
@ -86,6 +86,26 @@ public:
|
||||
file_name = file_name_;
|
||||
}
|
||||
|
||||
void locateFile(NameFilter filter)
|
||||
{
|
||||
int err = unzGoToFirstFile(raw_handle);
|
||||
if (err == UNZ_END_OF_LIST_OF_FILE)
|
||||
showError("No file was found satisfying the filter");
|
||||
|
||||
do
|
||||
{
|
||||
checkResult(err);
|
||||
resetFileInfo();
|
||||
retrieveFileInfo();
|
||||
if (filter(getFileName()))
|
||||
return;
|
||||
|
||||
err = unzGoToNextFile(raw_handle);
|
||||
} while (err != UNZ_END_OF_LIST_OF_FILE);
|
||||
|
||||
showError("No file was found satisfying the filter");
|
||||
}
|
||||
|
||||
bool tryLocateFile(const String & file_name_)
|
||||
{
|
||||
resetFileInfo();
|
||||
@ -132,7 +152,7 @@ public:
|
||||
return *file_info;
|
||||
}
|
||||
|
||||
std::vector<std::string> getAllFiles()
|
||||
std::vector<std::string> getAllFiles(NameFilter filter)
|
||||
{
|
||||
std::vector<std::string> files;
|
||||
resetFileInfo();
|
||||
@ -145,7 +165,8 @@ public:
|
||||
checkResult(err);
|
||||
resetFileInfo();
|
||||
retrieveFileInfo();
|
||||
files.push_back(*file_name);
|
||||
if (!filter || filter(getFileName()))
|
||||
files.push_back(*file_name);
|
||||
err = unzGoToNextFile(raw_handle);
|
||||
} while (err != UNZ_END_OF_LIST_OF_FILE);
|
||||
|
||||
@ -512,6 +533,13 @@ std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(const String
|
||||
return std::make_unique<ReadBufferFromZipArchive>(std::move(handle));
|
||||
}
|
||||
|
||||
std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(NameFilter filter)
|
||||
{
|
||||
auto handle = acquireHandle();
|
||||
handle.locateFile(filter);
|
||||
return std::make_unique<ReadBufferFromZipArchive>(std::move(handle));
|
||||
}
|
||||
|
||||
std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(std::unique_ptr<FileEnumerator> enumerator)
|
||||
{
|
||||
if (!dynamic_cast<FileEnumeratorImpl *>(enumerator.get()))
|
||||
@ -533,9 +561,14 @@ std::unique_ptr<ZipArchiveReader::FileEnumerator> ZipArchiveReader::nextFile(std
|
||||
}
|
||||
|
||||
std::vector<std::string> ZipArchiveReader::getAllFiles()
|
||||
{
|
||||
return getAllFiles({});
|
||||
}
|
||||
|
||||
std::vector<std::string> ZipArchiveReader::getAllFiles(NameFilter filter)
|
||||
{
|
||||
auto handle = acquireHandle();
|
||||
return handle.getAllFiles();
|
||||
return handle.getAllFiles(filter);
|
||||
}
|
||||
|
||||
void ZipArchiveReader::setPassword(const String & password_)
|
||||
|
@ -42,12 +42,14 @@ public:
|
||||
/// you can read that buffer to extract uncompressed data from the archive.
|
||||
/// Several read buffers can be used at the same time in parallel.
|
||||
std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) override;
|
||||
std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter) override;
|
||||
|
||||
/// It's possible to convert a file enumerator to a read buffer and vice versa.
|
||||
std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;
|
||||
std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) override;
|
||||
|
||||
std::vector<std::string> getAllFiles() override;
|
||||
std::vector<std::string> getAllFiles(NameFilter filter) override;
|
||||
|
||||
/// Sets password used to decrypt the contents of the files in the archive.
|
||||
void setPassword(const String & password_) override;
|
||||
|
@ -57,7 +57,6 @@
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
namespace ProfileEvents
|
||||
{
|
||||
extern const Event CreatedReadBufferOrdinary;
|
||||
@ -387,7 +386,23 @@ std::unique_ptr<ReadBuffer> createReadBuffer(
|
||||
if (!path_to_archive.empty())
|
||||
{
|
||||
auto reader = createArchiveReader(path_to_archive);
|
||||
return reader->readFile(current_path);
|
||||
|
||||
if (current_path.find_first_of("*?{") != std::string::npos)
|
||||
{
|
||||
auto matcher = std::make_shared<re2::RE2>(makeRegexpPatternFromGlobs(current_path));
|
||||
if (!matcher->ok())
|
||||
throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP,
|
||||
"Cannot compile regex from glob ({}): {}", current_path, matcher->error());
|
||||
|
||||
return reader->readFile([matcher = std::move(matcher)](const std::string & path)
|
||||
{
|
||||
return re2::RE2::FullMatch(path, *matcher);
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
return reader->readFile(current_path);
|
||||
}
|
||||
}
|
||||
|
||||
if (use_table_fd)
|
||||
@ -529,14 +544,30 @@ ColumnsDescription StorageFile::getTableStructureFromFile(
|
||||
}
|
||||
else
|
||||
{
|
||||
read_buffer_iterator = [&, path_it = paths.begin(), archive_it = paths_to_archive.begin()](ColumnsDescription &) mutable -> std::unique_ptr<ReadBuffer>
|
||||
read_buffer_iterator = [&, path_it = paths.begin(), archive_it = paths_to_archive.begin(), first = true](ColumnsDescription &) mutable -> std::unique_ptr<ReadBuffer>
|
||||
{
|
||||
if (archive_it == paths_to_archive.end())
|
||||
return nullptr;
|
||||
String path;
|
||||
struct stat file_stat;
|
||||
do
|
||||
{
|
||||
if (archive_it == paths_to_archive.end())
|
||||
{
|
||||
if (first)
|
||||
throw Exception(
|
||||
ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
|
||||
"Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually",
|
||||
format);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto file_stat = getFileStat(*archive_it, false, -1, "File");
|
||||
path = *archive_it++;
|
||||
file_stat = getFileStat(path, false, -1, "File");
|
||||
}
|
||||
while (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0);
|
||||
|
||||
first = false;
|
||||
return createReadBuffer(*path_it, file_stat, false, -1, compression_method, context, path);
|
||||
|
||||
return createReadBuffer(*path_it, file_stat, false, -1, compression_method, context, *archive_it);
|
||||
};
|
||||
}
|
||||
|
||||
@ -1012,13 +1043,39 @@ Pipe StorageFile::read(
|
||||
|
||||
if (!paths_to_archive.empty())
|
||||
{
|
||||
if (paths.size() != 1)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Multiple paths defined for reading from archive");
|
||||
|
||||
const auto & path = paths[0];
|
||||
|
||||
IArchiveReader::NameFilter filter;
|
||||
if (path.find_first_of("*?{") != std::string::npos)
|
||||
{
|
||||
auto matcher = std::make_shared<re2::RE2>(makeRegexpPatternFromGlobs(path));
|
||||
if (!matcher->ok())
|
||||
throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP,
|
||||
"Cannot compile regex from glob ({}): {}", path, matcher->error());
|
||||
|
||||
filter = [matcher](const std::string & p)
|
||||
{
|
||||
return re2::RE2::FullMatch(p, *matcher);
|
||||
};
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < paths_to_archive.size(); ++i)
|
||||
{
|
||||
const auto & path_to_archive = paths_to_archive[i];
|
||||
auto archive_reader = createArchiveReader(path_to_archive);
|
||||
auto files = archive_reader->getAllFiles();
|
||||
for (auto & file : files)
|
||||
files_in_archive.push_back({i, std::move(file)});
|
||||
if (filter)
|
||||
{
|
||||
const auto & path_to_archive = paths_to_archive[i];
|
||||
auto archive_reader = createArchiveReader(path_to_archive);
|
||||
auto files = archive_reader->getAllFiles(filter);
|
||||
for (auto & file : files)
|
||||
files_in_archive.push_back({i, std::move(file)});
|
||||
}
|
||||
else
|
||||
{
|
||||
files_in_archive.push_back({i, path});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user