mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-29 13:10:48 +00:00
ReadBufferFromBlobStorage operating like a proper stream, rather than a single download-read buffer
This commit is contained in:
parent
95816875f4
commit
956348073b
@ -25,65 +25,32 @@ ReadBufferFromBlobStorage::ReadBufferFromBlobStorage(
|
|||||||
Azure::Storage::Blobs::BlobContainerClient blob_container_client_,
|
Azure::Storage::Blobs::BlobContainerClient blob_container_client_,
|
||||||
const String & path_,
|
const String & path_,
|
||||||
UInt64 max_single_read_retries_,
|
UInt64 max_single_read_retries_,
|
||||||
size_t /* buf_size_ */) :
|
size_t buf_size_) :
|
||||||
SeekableReadBuffer(nullptr, 0),
|
SeekableReadBuffer(nullptr, 0),
|
||||||
blob_container_client(blob_container_client_),
|
blob_container_client(blob_container_client_),
|
||||||
|
tmp_buffer(buf_size_),
|
||||||
max_single_read_retries(max_single_read_retries_),
|
max_single_read_retries(max_single_read_retries_),
|
||||||
path(path_) {}
|
path(path_),
|
||||||
|
buf_size(buf_size_) {}
|
||||||
|
|
||||||
|
|
||||||
bool ReadBufferFromBlobStorage::nextImpl()
|
bool ReadBufferFromBlobStorage::nextImpl()
|
||||||
{
|
{
|
||||||
bool next_result = false;
|
// TODO: is this "stream" approach better than a single DownloadTo approach (last commit 90fc230c4dfacc1a9d50d2d65b91363150caa784) ?
|
||||||
|
|
||||||
if (impl)
|
if (!initialized)
|
||||||
{
|
initialize();
|
||||||
/// `impl` has been initialized earlier and now we're at the end of the current portion of data.
|
|
||||||
impl->position() = position();
|
|
||||||
assert(!impl->hasPendingData());
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/// `impl` is not initialized and we're about to read the first portion of data.
|
|
||||||
impl = initialize();
|
|
||||||
next_result = impl->hasPendingData();
|
|
||||||
}
|
|
||||||
|
|
||||||
auto sleep_time_with_backoff_milliseconds = std::chrono::milliseconds(100);
|
if (static_cast<size_t>(offset) >= total_size)
|
||||||
for (size_t attempt = 0; (attempt < max_single_read_retries) && !next_result; ++attempt)
|
|
||||||
{
|
|
||||||
try
|
|
||||||
{
|
|
||||||
/// Try to read a next portion of data.
|
|
||||||
next_result = impl->next();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
catch (const Exception & e)
|
|
||||||
{
|
|
||||||
// TODO: can't get this to compile, getting "error: reference to overloaded function could not be resolved; did you mean to call it?"
|
|
||||||
// LOG_DEBUG(log, "Caught exception while reading Blob Storage object. Object: {}, Offset: {}, Attempt: {}, Message: {}",
|
|
||||||
// path, getPosition(), attempt, e.message());
|
|
||||||
|
|
||||||
std::cout << "Caught exception while reading Blob Storage object. Object: " << path << ", Offset: "
|
|
||||||
<< getPosition() << ", Attempt: " << attempt << ", Message: " << e.message() << "\n";
|
|
||||||
|
|
||||||
/// Pause before next attempt.
|
|
||||||
std::this_thread::sleep_for(sleep_time_with_backoff_milliseconds);
|
|
||||||
sleep_time_with_backoff_milliseconds *= 2;
|
|
||||||
|
|
||||||
/// Try to reinitialize `impl`.
|
|
||||||
impl.reset();
|
|
||||||
impl = initialize();
|
|
||||||
next_result = impl->hasPendingData();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!next_result)
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
BufferBase::set(impl->buffer().begin(), impl->buffer().size(), impl->offset()); /// use the buffer returned by `impl`
|
size_t to_read_bytes = std::min(total_size - offset, buf_size);
|
||||||
|
|
||||||
offset += working_buffer.size();
|
size_t bytes_read = data_stream->Read(tmp_buffer.data(), to_read_bytes);
|
||||||
|
|
||||||
|
BufferBase::set(reinterpret_cast<char *>(tmp_buffer.data()), bytes_read, 0);
|
||||||
|
|
||||||
|
offset += bytes_read;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -91,7 +58,7 @@ bool ReadBufferFromBlobStorage::nextImpl()
|
|||||||
|
|
||||||
off_t ReadBufferFromBlobStorage::seek(off_t offset_, int whence)
|
off_t ReadBufferFromBlobStorage::seek(off_t offset_, int whence)
|
||||||
{
|
{
|
||||||
if (impl)
|
if (initialized)
|
||||||
throw Exception("Seek is allowed only before first read attempt from the buffer.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
|
throw Exception("Seek is allowed only before first read attempt from the buffer.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
|
||||||
|
|
||||||
if (whence != SEEK_SET)
|
if (whence != SEEK_SET)
|
||||||
@ -108,26 +75,50 @@ off_t ReadBufferFromBlobStorage::seek(off_t offset_, int whence)
|
|||||||
|
|
||||||
off_t ReadBufferFromBlobStorage::getPosition()
|
off_t ReadBufferFromBlobStorage::getPosition()
|
||||||
{
|
{
|
||||||
return offset - available();
|
// TODO: which one is the right one?
|
||||||
|
// return offset - available();
|
||||||
|
|
||||||
|
return offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::unique_ptr<ReadBuffer> ReadBufferFromBlobStorage::initialize()
|
void ReadBufferFromBlobStorage::initialize()
|
||||||
{
|
{
|
||||||
|
if (initialized)
|
||||||
|
return;
|
||||||
|
|
||||||
auto blob_client = blob_container_client.GetBlobClient(path);
|
auto blob_client = blob_container_client.GetBlobClient(path);
|
||||||
auto prop = blob_client.GetProperties();
|
|
||||||
auto blob_size = prop.Value.BlobSize;
|
|
||||||
|
|
||||||
#ifdef VERBOSE_DEBUG_MODE
|
auto download_response = blob_client.Download();
|
||||||
std::cout << "path: " << path << "\n";
|
|
||||||
std::cout << "blob_size: " << blob_size << "\n";
|
|
||||||
#endif
|
|
||||||
|
|
||||||
tmp_buffer.resize(blob_size);
|
data_stream = std::move(download_response.Value.BodyStream);
|
||||||
|
|
||||||
blob_client.DownloadTo(tmp_buffer.data(), blob_size);
|
if (data_stream == nullptr)
|
||||||
|
{
|
||||||
|
// TODO: change error code
|
||||||
|
throw Exception("Null data stream obtained from blob Download", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
|
||||||
|
}
|
||||||
|
|
||||||
return std::make_unique<ReadBufferFromString>(tmp_buffer);
|
total_size = data_stream->Length();
|
||||||
|
|
||||||
|
if (offset != 0)
|
||||||
|
{
|
||||||
|
// TODO: is it the right way?
|
||||||
|
/// try to rewind to offset in the buffer
|
||||||
|
size_t total_read_bytes = 0;
|
||||||
|
while (total_read_bytes < static_cast<size_t>(offset))
|
||||||
|
{
|
||||||
|
size_t to_read_bytes = std::min(offset - total_read_bytes, buf_size);
|
||||||
|
size_t bytes_read = data_stream->Read(tmp_buffer.data(), to_read_bytes);
|
||||||
|
total_read_bytes += bytes_read;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
initialized = true;
|
||||||
|
|
||||||
|
// TODO: dummy if to avoid warning for max_single_read_retries
|
||||||
|
if (max_single_read_retries == 0)
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -30,14 +30,17 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
std::unique_ptr<ReadBuffer> initialize();
|
void initialize();
|
||||||
|
|
||||||
|
std::unique_ptr<Azure::Core::IO::BodyStream> data_stream;
|
||||||
Azure::Storage::Blobs::BlobContainerClient blob_container_client;
|
Azure::Storage::Blobs::BlobContainerClient blob_container_client;
|
||||||
std::unique_ptr<ReadBuffer> impl;
|
|
||||||
std::vector<uint8_t> tmp_buffer;
|
std::vector<uint8_t> tmp_buffer;
|
||||||
UInt64 max_single_read_retries;
|
UInt64 max_single_read_retries; // TODO: unused field
|
||||||
const String & path;
|
const String & path;
|
||||||
off_t offset = 0;
|
off_t offset = 0;
|
||||||
|
size_t buf_size;
|
||||||
|
size_t total_size;
|
||||||
|
bool initialized = false;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user