add SevenZipArchiveReader/TarArchiveReader

This commit is contained in:
Nikita Keba 2023-05-29 17:10:03 +00:00
parent f07999699f
commit 19a0fbeccc
5 changed files with 458 additions and 0 deletions

View File

@ -0,0 +1,166 @@
#include "SevenZipArchiveReader.h"
#include <IO/ReadBufferFromFileBase.h>
#include <Common/quoteString.h>
namespace DB{
/// Error codes thrown by this file. They are only declared here (`extern`);
/// the definitions live elsewhere in the project.
namespace ErrorCodes
{
extern const int CANNOT_UNPACK_ARCHIVE;
extern const int LOGICAL_ERROR;
extern const int SEEK_POSITION_OUT_OF_BOUND;
}
/// Owns a libarchive read handle opened on an archive file in the local filesystem.
/// The handle is released in the destructor; copying is disabled because two copies
/// would free the same libarchive handle twice.
class SevenZipArchiveReader::Handle
{
public:
    /// Opens `path_to_archive_` for reading with all libarchive filters and formats enabled.
    /// Throws CANNOT_UNPACK_ARCHIVE if the archive cannot be opened.
    explicit Handle(const String & path_to_archive_)
        : path_to_archive(path_to_archive_)
    {
        archive = archive_read_new();
        if (!archive)
            throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open 7z archive {}", quoteString(path_to_archive));

        archive_read_support_filter_all(archive);
        archive_read_support_format_all(archive);
        if (archive_read_open_filename(archive, path_to_archive.c_str(), 10240) != ARCHIVE_OK)
        {
            /// The destructor does not run when a constructor throws, so release the handle here.
            archive_read_free(archive);
            archive = nullptr;
            throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open 7z archive {}", quoteString(path_to_archive));
        }
        /// `entry` is intentionally not allocated: archive_read_next_header() overwrites the
        /// pointer with an entry owned by libarchive, so a separately allocated one would leak.
    }

    Handle(const Handle &) = delete;
    Handle & operator=(const Handle &) = delete;

    ~Handle()
    {
        archive_read_close(archive);
        archive_read_free(archive);
    }

    /// Advances through the entry headers until one whose path equals `filename` is found.
    /// Returns true and leaves `entry` on that file; returns false once no further header
    /// can be read. The scan continues from the current position and never rewinds.
    bool locateFile(const String & filename)
    {
        while (archive_read_next_header(archive, &entry) == ARCHIVE_OK)
        {
            /// The pathname can be NULL (e.g. it could not be converted); such an entry never matches.
            const char * pathname = archive_entry_pathname(entry);
            if (pathname && filename == pathname)
                return true;
        }
        return false;
    }

    /// The libarchive read handle; valid for the lifetime of this object.
    struct archive * archive = nullptr;
    /// The most recently read entry header, owned by libarchive; null until locateFile() is called.
    struct archive_entry * entry = nullptr;

private:
    const String path_to_archive;
};
/// Read buffer producing the uncompressed contents of one file stored in an archive.
/// Every buffer opens its own archive handle, so buffers do not share libarchive state.
class SevenZipArchiveReader::ReadBufferFromSevenZipArchive : public ReadBufferFromFileBase
{
public:
    /// Opens `path_to_archive_` and positions the handle on `filename_`.
    /// Throws CANNOT_UNPACK_ARCHIVE if the archive cannot be opened or the file is not in it.
    explicit ReadBufferFromSevenZipArchive(const String & path_to_archive_, const String & filename_)
        : ReadBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0)
        , handle(path_to_archive_)
        , path_to_archive(path_to_archive_)
        , filename(filename_)
    {
        if (!handle.locateFile(filename_))
            throw Exception(
                ErrorCodes::CANNOT_UNPACK_ARCHIVE,
                "Couldn't find file {} in 7z archive {}",
                quoteString(filename),
                quoteString(path_to_archive));
    }

    /// Moves the read position to `off` (SEEK_SET) or `off` bytes from the current position (SEEK_CUR).
    /// The stream is decompressed sequentially, so only positions inside the current working buffer
    /// or ahead of it can be reached; anything else throws SEEK_POSITION_OUT_OF_BOUND.
    off_t seek(off_t off, int whence) override
    {
        off_t current_pos = getPosition();
        off_t new_pos;
        if (whence == SEEK_SET)
            new_pos = off;
        else if (whence == SEEK_CUR)
            new_pos = off + current_pos;
        else
            throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Only SEEK_SET and SEEK_CUR seek modes allowed.");

        if (new_pos == current_pos)
            return current_pos; /// The position is the same.

        if (new_pos < 0)
            throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bound");

        off_t working_buffer_start_pos = current_pos - offset();
        off_t working_buffer_end_pos = current_pos + available();
        if ((working_buffer_start_pos <= new_pos) && (new_pos <= working_buffer_end_pos))
        {
            /// The new position is still inside the buffer.
            position() += new_pos - current_pos;
            return new_pos;
        }

        /// Data before the working buffer has already been consumed from the archive stream.
        /// Passing the negative distance to ignore() would wrap around to a huge unsigned count.
        if (new_pos < current_pos)
            throw Exception(
                ErrorCodes::SEEK_POSITION_OUT_OF_BOUND,
                "Seeking backwards outside of the current buffer is not supported for files inside an archive");

        /// Check that the new position is not beyond the end of the file.
        if (new_pos > archive_entry_size(handle.entry))
            throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bound");

        ignore(new_pos - current_pos);
        return new_pos;
    }

    /// Returns the offset of the read position inside the uncompressed file: the number of bytes
    /// obtained from libarchive so far minus the bytes still unread in the working buffer.
    off_t getPosition() override
    {
        return static_cast<off_t>(entry_bytes_read - available());
    }

    String getFileName() const override { return filename; }

private:
    /// Refills the working buffer with the next chunk of uncompressed data.
    /// Returns false at the end of the file; throws CANNOT_UNPACK_ARCHIVE when libarchive reports an error.
    bool nextImpl() override
    {
        auto bytes_read = archive_read_data(handle.archive, internal_buffer.begin(), internal_buffer.size());

        /// Negative values are libarchive status codes, not byte counts.
        if (bytes_read < 0)
        {
            const char * reason = archive_error_string(handle.archive);
            throw Exception(
                ErrorCodes::CANNOT_UNPACK_ARCHIVE,
                "Couldn't read file {} from 7z archive {}: {}",
                quoteString(filename),
                quoteString(path_to_archive),
                reason ? reason : "unknown error");
        }

        if (bytes_read == 0)
            return false;

        entry_bytes_read += static_cast<size_t>(bytes_read);
        working_buffer = internal_buffer;
        working_buffer.resize(bytes_read);
        return true;
    }

    Handle handle;
    const String path_to_archive;
    const String filename;
    /// Total number of uncompressed bytes of the file received from libarchive so far.
    size_t entry_bytes_read = 0;
};
/// Creates a reader for the archive stored at `path_to_archive_`.
/// Only the path is remembered here; the archive itself is not opened.
SevenZipArchiveReader::SevenZipArchiveReader(const String & path_to_archive_)
    : path_to_archive(path_to_archive_)
{
}
/// Creates a reader and stores the path together with the supplied read function and archive size.
/// Nothing is opened or read at construction time.
SevenZipArchiveReader::SevenZipArchiveReader(
    const String & path_to_archive_,
    const ReadArchiveFunction & archive_read_function_,
    UInt64 archive_size_)
    : path_to_archive(path_to_archive_)
    , archive_read_function(archive_read_function_)
    , archive_size(archive_size_)
{
}
/// Nothing to release explicitly: the members clean up after themselves.
SevenZipArchiveReader::~SevenZipArchiveReader() = default;
bool SevenZipArchiveReader::fileExists(const String& filename)
{
Handle handle(path_to_archive);
return handle.locateFile(filename);
}
/// Returns size information about `filename`.
/// Throws CANNOT_UNPACK_ARCHIVE if the archive cannot be opened or does not contain the file.
SevenZipArchiveReader::FileInfo SevenZipArchiveReader::getFileInfo(const String & filename)
{
    Handle handle(path_to_archive);

    /// Without this check the sizes below would be read from an entry that is not the requested file.
    if (!handle.locateFile(filename))
        throw Exception(
            ErrorCodes::CANNOT_UNPACK_ARCHIVE,
            "Couldn't find file {} in 7z archive {}",
            quoteString(filename),
            quoteString(path_to_archive));

    FileInfo info;
    info.uncompressed_size = archive_entry_size(handle.entry);
    /// The same entry size is reported as the compressed size; no separate compressed size is queried.
    info.compressed_size = archive_entry_size(handle.entry);
    info.is_encrypted = false;
    return info;
}
/// Enumeration of archive entries is not implemented: always returns a null pointer.
std::unique_ptr<SevenZipArchiveReader::FileEnumerator> SevenZipArchiveReader::firstFile()
{
    return {};
}
/// Returns a buffer reading the uncompressed contents of `filename`.
/// The buffer opens the archive and locates the file itself, so no separate handle is opened
/// here: doing so would only scan the archive a second time and discard the result.
std::unique_ptr<ReadBufferFromFileBase> SevenZipArchiveReader::readFile(const String & filename)
{
    return std::make_unique<ReadBufferFromSevenZipArchive>(path_to_archive, filename);
}
/// Converting an enumerator into a read buffer is not implemented: the argument is ignored
/// and a null pointer is returned.
std::unique_ptr<ReadBufferFromFileBase> SevenZipArchiveReader::readFile(std::unique_ptr<FileEnumerator> /* enumerator */)
{
    return {};
}
/// Converting a read buffer back into an enumerator is not implemented: the argument is ignored
/// and a null pointer is returned.
std::unique_ptr<SevenZipArchiveReader::FileEnumerator> SevenZipArchiveReader::nextFile(std::unique_ptr<ReadBuffer> /* read_buffer */)
{
    return {};
}
/// Passwords are not supported by this reader: every call throws LOGICAL_ERROR.
void SevenZipArchiveReader::setPassword(const String & /* password_ */)
{
    throw Exception(ErrorCodes::LOGICAL_ERROR, "Can not set password to .7z archive");
}
}

View File

@ -0,0 +1,62 @@
#pragma once
#include <IO/Archives/IArchiveReader.h>
#include <archive.h>
#include <archive_entry.h>
#include <iostream>
namespace DB
{
class ReadBuffer;
class ReadBufferFromFileBase;
class SeekableReadBuffer;
/// Implementation of IArchiveReader for reading SevenZip archives.
/// Implementation of IArchiveReader for reading SevenZip archives.
/// Files are accessed by name. Enumerating the archive is not implemented:
/// firstFile(), nextFile() and readFile(enumerator) return a null pointer.
class SevenZipArchiveReader : public IArchiveReader
{
public:
    /// Constructs an archive's reader that will read from a file in the local filesystem.
    explicit SevenZipArchiveReader(const String & path_to_archive_);

    /// Constructs an archive's reader from a path, a function producing a read buffer and the archive size.
    /// NOTE: the function and the size are only stored; the implementation opens `path_to_archive_`
    /// from the local filesystem.
    SevenZipArchiveReader(const String & path_to_archive_, const ReadArchiveFunction & archive_read_function_, UInt64 archive_size_);

    ~SevenZipArchiveReader() override;

    /// Returns true if there is a specified file in the archive.
    bool fileExists(const String & filename) override;

    /// Returns the information about a file stored in the archive.
    FileInfo getFileInfo(const String & filename) override;

    /// Not implemented: returns a null pointer.
    std::unique_ptr<FileEnumerator> firstFile() override;

    /// Starts reading a file from the archive. The function returns a read buffer,
    /// you can read that buffer to extract uncompressed data from the archive.
    /// Every returned buffer opens the archive on its own.
    std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) override;

    /// Not implemented: both conversions return a null pointer.
    std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;
    std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) override;

    /// Not supported: always throws LOGICAL_ERROR.
    void setPassword(const String & password_) override;

private:
    class ReadBufferFromSevenZipArchive;
    class Handle;

    const String path_to_archive;
    const ReadArchiveFunction archive_read_function;
    [[maybe_unused]] const UInt64 archive_size = 0;
};
}

View File

@ -0,0 +1,165 @@
#include "TarArchiveReader.h"
#include <IO/ReadBufferFromFileBase.h>
#include <Common/quoteString.h>
namespace DB{
/// Error codes thrown by this file. They are only declared here (`extern`);
/// the definitions live elsewhere in the project.
namespace ErrorCodes
{
extern const int CANNOT_UNPACK_ARCHIVE;
extern const int LOGICAL_ERROR;
extern const int SEEK_POSITION_OUT_OF_BOUND;
}
/// Owns a libarchive read handle opened on an archive file in the local filesystem.
/// The handle is released in the destructor; copying is disabled because two copies
/// would free the same libarchive handle twice.
class TarArchiveReader::Handle
{
public:
    /// Opens `path_to_archive_` for reading with all libarchive filters and formats enabled.
    /// Throws CANNOT_UNPACK_ARCHIVE if the archive cannot be opened.
    explicit Handle(const String & path_to_archive_)
        : path_to_archive(path_to_archive_)
    {
        archive = archive_read_new();
        if (!archive)
            throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open tar archive {}", quoteString(path_to_archive));

        archive_read_support_filter_all(archive);
        archive_read_support_format_all(archive);
        if (archive_read_open_filename(archive, path_to_archive.c_str(), 10240) != ARCHIVE_OK)
        {
            /// The destructor does not run when a constructor throws, so release the handle here.
            archive_read_free(archive);
            archive = nullptr;
            throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open tar archive {}", quoteString(path_to_archive));
        }
        /// `entry` is intentionally not allocated: archive_read_next_header() overwrites the
        /// pointer with an entry owned by libarchive, so a separately allocated one would leak.
    }

    Handle(const Handle &) = delete;
    Handle & operator=(const Handle &) = delete;

    ~Handle()
    {
        archive_read_close(archive);
        archive_read_free(archive);
    }

    /// Advances through the entry headers until one whose path equals `filename` is found.
    /// Returns true and leaves `entry` on that file; returns false once no further header
    /// can be read. The scan continues from the current position and never rewinds.
    bool locateFile(const String & filename)
    {
        while (archive_read_next_header(archive, &entry) == ARCHIVE_OK)
        {
            /// The pathname can be NULL (e.g. it could not be converted); such an entry never matches.
            const char * pathname = archive_entry_pathname(entry);
            if (pathname && filename == pathname)
                return true;
        }
        return false;
    }

    /// The libarchive read handle; valid for the lifetime of this object.
    struct archive * archive = nullptr;
    /// The most recently read entry header, owned by libarchive; null until locateFile() is called.
    struct archive_entry * entry = nullptr;

private:
    const String path_to_archive;
};
/// Read buffer producing the contents of one file stored in a tar archive.
/// Every buffer opens its own archive handle, so buffers do not share libarchive state.
class TarArchiveReader::ReadBufferFromTarArchive : public ReadBufferFromFileBase
{
public:
    /// Opens `path_to_archive_` and positions the handle on `filename_`.
    /// Throws CANNOT_UNPACK_ARCHIVE if the archive cannot be opened or the file is not in it.
    explicit ReadBufferFromTarArchive(const String & path_to_archive_, const String & filename_)
        : ReadBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0)
        , handle(path_to_archive_)
        , path_to_archive(path_to_archive_)
        , filename(filename_)
    {
        if (!handle.locateFile(filename_))
            throw Exception(
                ErrorCodes::CANNOT_UNPACK_ARCHIVE,
                "Couldn't find file {} in tar archive {}",
                quoteString(filename),
                quoteString(path_to_archive));
    }

    /// Moves the read position to `off` (SEEK_SET) or `off` bytes from the current position (SEEK_CUR).
    /// The stream is read sequentially, so only positions inside the current working buffer
    /// or ahead of it can be reached; anything else throws SEEK_POSITION_OUT_OF_BOUND.
    off_t seek(off_t off, int whence) override
    {
        off_t current_pos = getPosition();
        off_t new_pos;
        if (whence == SEEK_SET)
            new_pos = off;
        else if (whence == SEEK_CUR)
            new_pos = off + current_pos;
        else
            throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Only SEEK_SET and SEEK_CUR seek modes allowed.");

        if (new_pos == current_pos)
            return current_pos; /// The position is the same.

        if (new_pos < 0)
            throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bound");

        off_t working_buffer_start_pos = current_pos - offset();
        off_t working_buffer_end_pos = current_pos + available();
        if ((working_buffer_start_pos <= new_pos) && (new_pos <= working_buffer_end_pos))
        {
            /// The new position is still inside the buffer.
            position() += new_pos - current_pos;
            return new_pos;
        }

        /// Data before the working buffer has already been consumed from the archive stream.
        /// Passing the negative distance to ignore() would wrap around to a huge unsigned count.
        if (new_pos < current_pos)
            throw Exception(
                ErrorCodes::SEEK_POSITION_OUT_OF_BOUND,
                "Seeking backwards outside of the current buffer is not supported for files inside an archive");

        /// Check that the new position is not beyond the end of the file.
        if (new_pos > archive_entry_size(handle.entry))
            throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bound");

        ignore(new_pos - current_pos);
        return new_pos;
    }

    /// Returns the offset of the read position inside the file: the number of bytes obtained
    /// from libarchive so far minus the bytes still unread in the working buffer.
    off_t getPosition() override
    {
        return static_cast<off_t>(entry_bytes_read - available());
    }

    String getFileName() const override { return filename; }

private:
    /// Refills the working buffer with the next chunk of file data.
    /// Returns false at the end of the file; throws CANNOT_UNPACK_ARCHIVE when libarchive reports an error.
    bool nextImpl() override
    {
        auto bytes_read = archive_read_data(handle.archive, internal_buffer.begin(), internal_buffer.size());

        /// Negative values are libarchive status codes, not byte counts.
        if (bytes_read < 0)
        {
            const char * reason = archive_error_string(handle.archive);
            throw Exception(
                ErrorCodes::CANNOT_UNPACK_ARCHIVE,
                "Couldn't read file {} from tar archive {}: {}",
                quoteString(filename),
                quoteString(path_to_archive),
                reason ? reason : "unknown error");
        }

        if (bytes_read == 0)
            return false;

        entry_bytes_read += static_cast<size_t>(bytes_read);
        working_buffer = internal_buffer;
        working_buffer.resize(bytes_read);
        return true;
    }

    Handle handle;
    const String path_to_archive;
    const String filename;
    /// Total number of bytes of the file received from libarchive so far.
    size_t entry_bytes_read = 0;
};
/// Creates a reader for the archive stored at `path_to_archive_`.
/// Only the path is remembered here; the archive itself is not opened.
TarArchiveReader::TarArchiveReader(const String & path_to_archive_)
    : path_to_archive(path_to_archive_)
{
}
/// Creates a reader and stores the path together with the supplied read function and archive size.
/// Nothing is opened or read at construction time.
TarArchiveReader::TarArchiveReader(
    const String & path_to_archive_,
    const ReadArchiveFunction & archive_read_function_,
    UInt64 archive_size_)
    : path_to_archive(path_to_archive_)
    , archive_read_function(archive_read_function_)
    , archive_size(archive_size_)
{
}
/// Nothing to release explicitly: the members clean up after themselves.
TarArchiveReader::~TarArchiveReader() = default;
bool TarArchiveReader::fileExists(const String& filename)
{
Handle handle(path_to_archive);
return handle.locateFile(filename);
}
/// Returns size information about `filename`.
/// Throws CANNOT_UNPACK_ARCHIVE if the archive cannot be opened or does not contain the file.
TarArchiveReader::FileInfo TarArchiveReader::getFileInfo(const String & filename)
{
    Handle handle(path_to_archive);

    /// Without this check the sizes below would be read from an entry that is not the requested file.
    if (!handle.locateFile(filename))
        throw Exception(
            ErrorCodes::CANNOT_UNPACK_ARCHIVE,
            "Couldn't find file {} in tar archive {}",
            quoteString(filename),
            quoteString(path_to_archive));

    FileInfo info;
    info.uncompressed_size = archive_entry_size(handle.entry);
    /// The same entry size is reported as the compressed size; no separate compressed size is queried.
    info.compressed_size = archive_entry_size(handle.entry);
    info.is_encrypted = false;
    return info;
}
/// Enumeration of archive entries is not implemented: always returns a null pointer.
std::unique_ptr<TarArchiveReader::FileEnumerator> TarArchiveReader::firstFile()
{
    return {};
}
/// Returns a buffer reading the contents of `filename`.
/// The buffer opens the archive and locates the file itself, so no separate handle is opened
/// here: doing so would only scan the archive a second time and discard the result.
std::unique_ptr<ReadBufferFromFileBase> TarArchiveReader::readFile(const String & filename)
{
    return std::make_unique<ReadBufferFromTarArchive>(path_to_archive, filename);
}
/// Converting an enumerator into a read buffer is not implemented: the argument is ignored
/// and a null pointer is returned.
std::unique_ptr<ReadBufferFromFileBase> TarArchiveReader::readFile(std::unique_ptr<FileEnumerator> /* enumerator */)
{
    return {};
}
/// Converting a read buffer back into an enumerator is not implemented: the argument is ignored
/// and a null pointer is returned.
std::unique_ptr<TarArchiveReader::FileEnumerator> TarArchiveReader::nextFile(std::unique_ptr<ReadBuffer> /* read_buffer */)
{
    return {};
}
/// Passwords are not supported by this reader: every call throws LOGICAL_ERROR.
void TarArchiveReader::setPassword(const String & /* password_ */)
{
    throw Exception(ErrorCodes::LOGICAL_ERROR, "Can not set password to .tar archive");
}
}

View File

@ -0,0 +1,59 @@
#pragma once
#include <IO/Archives/IArchiveReader.h>
#include <archive.h>
#include <archive_entry.h>
namespace DB
{
class ReadBuffer;
class ReadBufferFromFileBase;
class SeekableReadBuffer;
/// Implementation of IArchiveReader for reading tar archives.
/// IArchiveReader implementation for tar archives.
class TarArchiveReader : public IArchiveReader
{
public:
    /// Creates a reader for an archive file located in the local filesystem.
    explicit TarArchiveReader(const String & path_to_archive_);

    /// Creates a reader from a path, a function that makes a read buffer, and the archive size.
    TarArchiveReader(const String & path_to_archive_, const ReadArchiveFunction & archive_read_function_, UInt64 archive_size_);

    ~TarArchiveReader() override;

    /// Tells whether the archive contains the specified file.
    bool fileExists(const String & filename) override;

    /// Returns the information about a file stored in the archive.
    FileInfo getFileInfo(const String & filename) override;

    /// Entry point for enumerating the files of the archive.
    std::unique_ptr<FileEnumerator> firstFile() override;

    /// Opens a file of the archive for reading; the returned buffer yields the file's data.
    std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) override;

    /// Conversions between a file enumerator and a read buffer.
    std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;
    std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) override;

    /// Password setter required by the IArchiveReader interface.
    void setPassword(const String & password_) override;

private:
    class ReadBufferFromTarArchive;
    class Handle;

    const String path_to_archive;
    const ReadArchiveFunction archive_read_function;
    [[maybe_unused]] const UInt64 archive_size = 0;
};
}

View File

@ -1,5 +1,7 @@
#include <IO/Archives/createArchiveReader.h> #include <IO/Archives/createArchiveReader.h>
#include <IO/Archives/ZipArchiveReader.h> #include <IO/Archives/ZipArchiveReader.h>
#include <IO/Archives/TarArchiveReader.h>
#include <IO/Archives/SevenZipArchiveReader.h>
#include <Common/Exception.h> #include <Common/Exception.h>
@ -30,6 +32,10 @@ std::shared_ptr<IArchiveReader> createArchiveReader(
#else #else
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "minizip library is disabled"); throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "minizip library is disabled");
#endif #endif
} else if (path_to_archive.ends_with(".tar")) {
return std::make_shared<TarArchiveReader>(path_to_archive, archive_read_function, archive_size);
} else if (path_to_archive.ends_with(".7z")) {
return std::make_shared<SevenZipArchiveReader>(path_to_archive, archive_read_function, archive_size);
} }
else else
throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Cannot determine the type of archive {}", path_to_archive); throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Cannot determine the type of archive {}", path_to_archive);