2021-10-04 13:46:03 +00:00
|
|
|
#if !defined(ARCADIA_BUILD)
|
|
|
|
#include <Common/config.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if USE_AZURE_BLOB_STORAGE
|
|
|
|
|
|
|
|
#include <iostream>
|
|
|
|
|
|
|
|
#include <IO/ReadBufferFromBlobStorage.h>
|
|
|
|
#include <IO/ReadBufferFromString.h>
|
2021-10-08 14:34:40 +00:00
|
|
|
#include <common/logger_useful.h>
|
|
|
|
|
2021-10-04 13:46:03 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int CANNOT_SEEK_THROUGH_FILE;
|
|
|
|
extern const int SEEK_POSITION_OUT_OF_BOUND;
|
2021-10-15 09:04:22 +00:00
|
|
|
extern const int RECEIVED_EMPTY_DATA;
|
2021-10-04 13:46:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Creates a read buffer for the blob `path_` inside the container reachable through `blob_container_client_`.
///
/// The base SeekableReadBuffer is constructed with no memory attached (nullptr, 0); the working buffer
/// is pointed at `tmp_buffer` later, in nextImpl(). The constructor body is empty: the download itself
/// is started by initialize(), not here.
///
/// `buf_size_` is used both to construct `tmp_buffer` and as the upper bound for a single read.
ReadBufferFromBlobStorage::ReadBufferFromBlobStorage(
    std::shared_ptr<Azure::Storage::Blobs::BlobContainerClient> blob_container_client_,
    const String & path_,
    size_t buf_size_)
    : SeekableReadBuffer(nullptr, 0)
    , blob_container_client(blob_container_client_)
    , tmp_buffer(buf_size_)
    , path(path_)
    , buf_size(buf_size_)
{
}
|
2021-10-04 13:46:03 +00:00
|
|
|
|
|
|
|
|
|
|
|
bool ReadBufferFromBlobStorage::nextImpl()
|
|
|
|
{
|
2021-10-11 14:54:22 +00:00
|
|
|
// TODO: is this "stream" approach better than a single DownloadTo approach (last commit 90fc230c4dfacc1a9d50d2d65b91363150caa784) ?
|
2021-10-04 13:46:03 +00:00
|
|
|
|
2021-10-11 14:54:22 +00:00
|
|
|
if (!initialized)
|
|
|
|
initialize();
|
2021-10-08 14:34:40 +00:00
|
|
|
|
2021-10-11 14:54:22 +00:00
|
|
|
if (static_cast<size_t>(offset) >= total_size)
|
|
|
|
return false;
|
2021-10-08 14:34:40 +00:00
|
|
|
|
2021-10-11 14:54:22 +00:00
|
|
|
size_t to_read_bytes = std::min(total_size - offset, buf_size);
|
|
|
|
size_t bytes_read = data_stream->Read(tmp_buffer.data(), to_read_bytes);
|
2021-10-04 13:46:03 +00:00
|
|
|
|
2021-10-11 14:54:22 +00:00
|
|
|
BufferBase::set(reinterpret_cast<char *>(tmp_buffer.data()), bytes_read, 0);
|
|
|
|
offset += bytes_read;
|
2021-10-04 13:46:03 +00:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
off_t ReadBufferFromBlobStorage::seek(off_t offset_, int whence)
|
|
|
|
{
|
2021-10-11 14:54:22 +00:00
|
|
|
if (initialized)
|
2021-10-04 13:46:03 +00:00
|
|
|
throw Exception("Seek is allowed only before first read attempt from the buffer.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
|
|
|
|
|
|
|
|
if (whence != SEEK_SET)
|
|
|
|
throw Exception("Only SEEK_SET mode is allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
|
|
|
|
|
|
|
|
if (offset_ < 0)
|
|
|
|
throw Exception("Seek position is out of bounds. Offset: " + std::to_string(offset_), ErrorCodes::SEEK_POSITION_OUT_OF_BOUND);
|
|
|
|
|
|
|
|
offset = offset_;
|
|
|
|
|
|
|
|
return offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
off_t ReadBufferFromBlobStorage::getPosition()
|
|
|
|
{
|
2021-10-11 14:54:22 +00:00
|
|
|
// TODO: which one is the right one?
|
|
|
|
// return offset - available();
|
|
|
|
|
|
|
|
return offset;
|
2021-10-04 13:46:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-10-11 14:54:22 +00:00
|
|
|
void ReadBufferFromBlobStorage::initialize()
|
2021-10-04 13:46:03 +00:00
|
|
|
{
|
2021-10-11 14:54:22 +00:00
|
|
|
if (initialized)
|
|
|
|
return;
|
|
|
|
|
2021-10-19 09:30:15 +00:00
|
|
|
auto blob_client = blob_container_client->GetBlobClient(path);
|
2021-10-11 14:54:22 +00:00
|
|
|
auto download_response = blob_client.Download();
|
|
|
|
data_stream = std::move(download_response.Value.BodyStream);
|
|
|
|
|
|
|
|
if (data_stream == nullptr)
|
|
|
|
{
|
2021-10-15 09:04:22 +00:00
|
|
|
throw Exception("Null data stream obtained while downloading a file from Blob Storage", ErrorCodes::RECEIVED_EMPTY_DATA);
|
2021-10-11 14:54:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
total_size = data_stream->Length();
|
|
|
|
|
|
|
|
if (offset != 0)
|
|
|
|
{
|
|
|
|
// TODO: is it the right way?
|
|
|
|
/// try to rewind to offset in the buffer
|
|
|
|
size_t total_read_bytes = 0;
|
|
|
|
while (total_read_bytes < static_cast<size_t>(offset))
|
|
|
|
{
|
|
|
|
size_t to_read_bytes = std::min(offset - total_read_bytes, buf_size);
|
|
|
|
size_t bytes_read = data_stream->Read(tmp_buffer.data(), to_read_bytes);
|
|
|
|
total_read_bytes += bytes_read;
|
|
|
|
}
|
|
|
|
}
|
2021-10-04 13:46:03 +00:00
|
|
|
|
2021-10-11 14:54:22 +00:00
|
|
|
initialized = true;
|
2021-10-04 13:46:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|