ClickHouse/src/Interpreters/TemporaryDataOnDisk.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

389 lines
12 KiB
C++
Raw Normal View History

#include <Interpreters/TemporaryDataOnDisk.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/ReadBufferFromFile.h>
#include <Compression/CompressedWriteBuffer.h>
#include <Compression/CompressedReadBuffer.h>
#include <Formats/NativeWriter.h>
#include <Formats/NativeReader.h>
2022-09-01 12:22:49 +00:00
#include <Core/ProtocolDefines.h>
#include <Disks/SingleDiskVolume.h>
#include <Disks/DiskLocal.h>
#include <Disks/IO/WriteBufferFromTemporaryFile.h>
2022-09-01 12:22:49 +00:00
#include <Common/logger_useful.h>
2022-12-06 17:27:05 +00:00
#include <Interpreters/Cache/WriteBufferToFileSegment.h>
namespace DB
{
2022-09-01 12:22:49 +00:00
/// Error codes thrown from this file; they are only declared here.
namespace ErrorCodes
{
    extern const int TOO_MANY_ROWS_OR_BYTES;
    extern const int LOGICAL_ERROR;
    extern const int NOT_ENOUGH_SPACE;
}
/// Applies a signed change to the accounted compressed / uncompressed sizes of this scope.
/// The change is forwarded to the parent scope (if any) before the local checks run.
/// Throws LOGICAL_ERROR if a negative delta would drop a counter below zero, and
/// TOO_MANY_ROWS_OR_BYTES if a positive compressed delta would exceed a non-zero `limit`.
void TemporaryDataOnDiskScope::deltaAllocAndCheck(ssize_t compressed_delta, ssize_t uncompressed_delta)
{
    if (parent)
        parent->deltaAllocAndCheck(compressed_delta, uncompressed_delta);

    /// Counters are unsigned, so make sure a negative delta cannot wrap them around.
    const bool compressed_underflow
        = compressed_delta < 0 && stat.compressed_size < static_cast<size_t>(-compressed_delta);
    const bool uncompressed_underflow
        = uncompressed_delta < 0 && stat.uncompressed_size < static_cast<size_t>(-uncompressed_delta);

    if (compressed_underflow || uncompressed_underflow)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Negative temporary data size");

    /// The limit is only enforced when the consumption grows; a zero limit means "no limit".
    size_t projected_consumption = stat.compressed_size + compressed_delta;
    if (compressed_delta > 0 && limit && projected_consumption > limit)
    {
        throw Exception(ErrorCodes::TOO_MANY_ROWS_OR_BYTES,
            "Limit for temporary files size exceeded (would consume {} / {} bytes)", projected_consumption, limit);
    }

    stat.compressed_size += compressed_delta;
    stat.uncompressed_size += uncompressed_delta;
}
/// Builds a temporary-data holder chained to `parent_`; the scope's own limit is passed as 0.
TemporaryDataOnDisk::TemporaryDataOnDisk(TemporaryDataOnDiskScopePtr parent_)
    : TemporaryDataOnDiskScope(std::move(parent_), /* limit_ = */ 0)
{
}
/// Same as the single-argument constructor, but additionally remembers `metric_scope`,
/// which is later handed to every regular temporary file created by this object.
TemporaryDataOnDisk::TemporaryDataOnDisk(TemporaryDataOnDiskScopePtr parent_, CurrentMetrics::Metric metric_scope)
    : TemporaryDataOnDiskScope(std::move(parent_), /* limit_ = */ 0)
    , current_metric_scope(metric_scope)
{
}
/// Creates a raw write buffer for temporary data.
/// The file cache is preferred when it is configured, otherwise the volume is used.
/// Throws LOGICAL_ERROR when neither a cache nor a volume is available.
WriteBufferPtr TemporaryDataOnDisk::createRawStream(size_t max_file_size)
{
    if (file_cache)
        return std::make_shared<WriteBufferToFileSegment>(createCacheFile(max_file_size));

    if (volume)
        return std::make_shared<WriteBufferFromTemporaryFile>(createRegularFile(max_file_size));

    throw Exception(ErrorCodes::LOGICAL_ERROR, "TemporaryDataOnDiskScope has no cache and no volume");
}
2022-12-06 17:27:05 +00:00
/// Creates a new block stream with the given header and registers it in `streams`.
/// The backing storage is allocated before `mutex` is taken; the stream itself is
/// constructed and appended under the lock.
/// The file cache is preferred when it is configured, otherwise the volume is used.
/// Throws LOGICAL_ERROR when neither a cache nor a volume is available.
TemporaryFileStream & TemporaryDataOnDisk::createStream(const Block & header, size_t max_file_size)
{
    if (file_cache)
    {
        auto cache_segments = createCacheFile(max_file_size);

        std::lock_guard lock(mutex);
        return *streams.emplace_back(std::make_unique<TemporaryFileStream>(std::move(cache_segments), header, this));
    }

    if (volume)
    {
        auto regular_file = createRegularFile(max_file_size);

        std::lock_guard lock(mutex);
        return *streams.emplace_back(std::make_unique<TemporaryFileStream>(std::move(regular_file), header, this));
    }

    throw Exception(ErrorCodes::LOGICAL_ERROR, "TemporaryDataOnDiskScope has no cache and no volume");
}
/// Allocates a temporary, unbounded file segment in the file cache under a random key
/// and makes sure the directory for that key exists.
/// The size passed to the cache is max(10 MiB, max_file_size).
/// Throws LOGICAL_ERROR when no file cache is configured.
FileSegmentsHolderPtr TemporaryDataOnDisk::createCacheFile(size_t max_file_size)
{
    if (!file_cache)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "TemporaryDataOnDiskScope has no cache");

    const auto cache_key = FileSegment::Key::random();
    const auto requested_size = std::max(10_MiB, max_file_size);

    auto segments = file_cache->set(
        cache_key, 0, requested_size, CreateFileSegmentSettings(FileSegmentKind::Temporary, /* unbounded */ true));

    fs::create_directories(file_cache->getPathInLocalCache(cache_key));
    return segments;
}
/// Creates a temporary file on one of the disks of `volume`.
/// With a non-zero `max_file_size` the disk is chosen via a space reservation of that size;
/// with zero the volume's default disk is taken.
/// Throws LOGICAL_ERROR when no volume is configured and NOT_ENOUGH_SPACE when the reservation fails.
TemporaryFileOnDiskHolder TemporaryDataOnDisk::createRegularFile(size_t max_file_size)
{
    if (!volume)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "TemporaryDataOnDiskScope has no volume");

    DiskPtr target_disk;
    if (max_file_size == 0)
    {
        target_disk = volume->getDisk();
    }
    else
    {
        /// The reservation object lives only inside this branch; just the chosen disk is kept.
        auto space_reservation = volume->reserve(max_file_size);
        if (!space_reservation)
            throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space on temporary disk");
        target_disk = space_reservation->getDisk();
    }

    return std::make_unique<TemporaryFileOnDisk>(target_disk, current_metric_scope);
}
2022-09-28 10:22:16 +00:00
std::vector<TemporaryFileStream *> TemporaryDataOnDisk::getStreams() const
{
std::vector<TemporaryFileStream *> res;
std::lock_guard lock(mutex);
2022-09-28 10:22:16 +00:00
res.reserve(streams.size());
for (const auto & stream : streams)
res.push_back(stream.get());
return res;
}
2022-09-21 12:51:46 +00:00
bool TemporaryDataOnDisk::empty() const
{
std::lock_guard lock(mutex);
return streams.empty();
}
/// Write side of a temporary stream: blocks go through a NativeWriter into a
/// CompressedWriteBuffer, which in turn writes into the owned output buffer.
struct TemporaryFileStream::OutputWriter
{
    /// Takes ownership of `out_buf_` and stacks the compressing buffer and the native writer on top of it.
    OutputWriter(std::unique_ptr<WriteBuffer> out_buf_, const Block & header_)
        : out_buf(std::move(out_buf_))
        , out_compressed_buf(*out_buf)
        , out_writer(out_compressed_buf, DBMS_TCP_PROTOCOL_VERSION, header_)
    {
    }

    /// Writes one block and returns the value reported by the native writer.
    /// Also accumulates the number of rows written.
    /// Throws LOGICAL_ERROR if the writer is already finalized.
    size_t write(const Block & block)
    {
        if (finalized)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write to finalized stream");

        const size_t bytes_reported = out_writer.write(block);
        num_rows += block.rows();
        return bytes_reported;
    }

    /// Pushes pending data down the buffer chain.
    /// Throws LOGICAL_ERROR if the writer is already finalized.
    void flush()
    {
        if (finalized)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot flush finalized stream");

        out_compressed_buf.next();
        out_buf->next();
        out_writer.flush();
    }

    /// Flushes the writer and finalizes both buffers. Repeated calls are no-ops.
    void finalize()
    {
        if (finalized)
            return;

        /// The flag is raised before doing any work: if an explicit finalize() throws,
        /// the destructor must not run into the same exception again.
        finalized = true;

        out_writer.flush();
        out_compressed_buf.finalize();
        out_buf->finalize();
    }

    /// Finalizes on destruction; exceptions are logged instead of propagated.
    ~OutputWriter()
    {
        try
        {
            finalize();
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }

    /// Declaration order matters: each member is constructed on top of the previous one.
    std::unique_ptr<WriteBuffer> out_buf;
    CompressedWriteBuffer out_compressed_buf;
    NativeWriter out_writer;

    std::atomic_size_t num_rows = 0;
    bool finalized = false;
};
/// Read side of a temporary stream: a file buffer wrapped into a CompressedReadBuffer
/// and consumed through a NativeReader.
struct TemporaryFileStream::InputReader
{
    /// Opens `path` and reads blocks using the supplied header.
    InputReader(const String & path, const Block & header_)
        : in_file_buf(path)
        , in_compressed_buf(in_file_buf)
        , in_reader(in_compressed_buf, header_, DBMS_TCP_PROTOCOL_VERSION)
    {
        LOG_TEST(&Poco::Logger::get("TemporaryFileStream"), "Reading {} from {}", header_.dumpStructure(), path);
    }

    /// Opens `path` without passing a header to the native reader.
    explicit InputReader(const String & path)
        : in_file_buf(path)
        , in_compressed_buf(in_file_buf)
        , in_reader(in_compressed_buf, DBMS_TCP_PROTOCOL_VERSION)
    {
        LOG_TEST(&Poco::Logger::get("TemporaryFileStream"), "Reading from {}", path);
    }

    /// Returns whatever the native reader produces for the next block.
    Block read()
    {
        Block next_block = in_reader.read();
        return next_block;
    }

    /// Declaration order matters: each member is constructed on top of the previous one.
    ReadBufferFromFile in_file_buf;
    CompressedReadBuffer in_compressed_buf;
    NativeReader in_reader;
};
2022-12-06 17:27:05 +00:00
/// Creates a stream backed by a regular temporary file.
/// The writer is opened immediately on the file's absolute path.
TemporaryFileStream::TemporaryFileStream(TemporaryFileOnDiskHolder file_, const Block & header_, TemporaryDataOnDisk * parent_)
    : parent(parent_)
    , header(header_)
    , file(std::move(file_))
    , out_writer(std::make_unique<OutputWriter>(
          std::make_unique<WriteBufferFromFile>(file->getAbsolutePath()),
          header))
{
    LOG_TEST(&Poco::Logger::get("TemporaryFileStream"), "Writing to temporary file {}", file->getAbsolutePath());
}
2023-01-04 19:57:42 +00:00
/// Creates a stream backed by a file-cache segment.
/// The holder must contain exactly one segment, otherwise LOGICAL_ERROR is thrown.
TemporaryFileStream::TemporaryFileStream(FileSegmentsHolderPtr segments_, const Block & header_, TemporaryDataOnDisk * parent_)
    : parent(parent_)
    , header(header_)
    , segment_holder(std::move(segments_))
{
    const bool single_segment = segment_holder->size() == 1;
    if (!single_segment)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "TemporaryFileStream can be created only from single segment");

    auto segment_buffer = std::make_unique<WriteBufferToFileSegment>(&segment_holder->front());

    /// Log before the buffer is moved into the writer, while its file name is still reachable here.
    LOG_TEST(&Poco::Logger::get("TemporaryFileStream"), "Writing to temporary file {}", segment_buffer->getFileName());

    out_writer = std::make_unique<OutputWriter>(std::move(segment_buffer), header);
}
2022-12-06 10:04:15 +00:00
size_t TemporaryFileStream::write(const Block & block)
{
if (!out_writer)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Writing has been finished");
2022-09-21 12:51:46 +00:00
updateAllocAndCheck();
2022-12-06 10:04:15 +00:00
size_t bytes_written = out_writer->write(block);
return bytes_written;
}
2022-12-06 17:27:05 +00:00
/// Flushes the underlying writer.
/// Throws LOGICAL_ERROR once writing has been finished.
void TemporaryFileStream::flush()
{
    if (out_writer == nullptr)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Writing has been finished");

    out_writer->flush();
}
/// Finalizes the writer (if it still exists), refreshes the statistics and drops the writer.
/// Returns the statistics of the stream; calling it again just returns them.
TemporaryFileStream::Stat TemporaryFileStream::finishWriting()
{
    if (!isWriteFinished())
    {
        out_writer->finalize();

        /// Finalization may flush buffered data, so the written sizes can still change here.
        updateAllocAndCheck();

        /// The reader is not created here; it appears on the first read() call
        /// so that no memory is consumed before it is needed.
        out_writer.reset();
    }

    return stat;
}
/// Writing is considered finished as soon as the writer has been dropped.
/// The reader and the writer must never exist at the same time (asserted).
bool TemporaryFileStream::isWriteFinished() const
{
    assert(in_reader == nullptr || out_writer == nullptr);
    return !out_writer;
}
/// Reads the next block from the stream.
/// Throws LOGICAL_ERROR if writing has not been finished yet.
/// Returns an empty block when the stream is exhausted or already released.
Block TemporaryFileStream::read()
{
    if (!isWriteFinished())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Writing has been not finished");

    if (isEof())
        return {};

    /// The reader is created lazily on the first read.
    if (in_reader == nullptr)
        in_reader = std::make_unique<InputReader>(getPath(), header);

    Block result = in_reader->read();

    /// An empty block marks the end of data: release resources right away instead of waiting for the destructor.
    if (!result)
        this->release();

    return result;
}
2022-09-21 12:51:46 +00:00
void TemporaryFileStream::updateAllocAndCheck()
2022-09-01 12:22:49 +00:00
{
2022-09-14 11:21:25 +00:00
assert(out_writer);
2022-09-01 12:22:49 +00:00
size_t new_compressed_size = out_writer->out_compressed_buf.getCompressedBytes();
size_t new_uncompressed_size = out_writer->out_compressed_buf.getUncompressedBytes();
if (unlikely(new_compressed_size < stat.compressed_size || new_uncompressed_size < stat.uncompressed_size))
2022-09-01 12:22:49 +00:00
{
2022-09-21 12:51:46 +00:00
throw Exception(ErrorCodes::LOGICAL_ERROR,
2022-09-01 12:22:49 +00:00
"Temporary file {} size decreased after write: compressed: {} -> {}, uncompressed: {} -> {}",
2022-12-06 17:27:05 +00:00
getPath(), new_compressed_size, stat.compressed_size, new_uncompressed_size, stat.uncompressed_size);
2022-09-01 12:22:49 +00:00
}
2022-09-21 12:51:46 +00:00
parent->deltaAllocAndCheck(new_compressed_size - stat.compressed_size, new_uncompressed_size - stat.uncompressed_size);
stat.compressed_size = new_compressed_size;
stat.uncompressed_size = new_uncompressed_size;
2022-10-05 16:35:10 +00:00
stat.num_rows = out_writer->num_rows;
2022-09-01 12:22:49 +00:00
}
2022-10-05 16:35:10 +00:00
/// The stream is at EOF once neither a regular file nor a cache segment holder is held anymore.
bool TemporaryFileStream::isEof() const
{
    const bool has_backing_storage = file != nullptr || segment_holder;
    return !has_backing_storage;
}
2022-10-05 16:35:10 +00:00
void TemporaryFileStream::release()
{
if (in_reader)
in_reader.reset();
if (out_writer)
{
out_writer->finalize();
out_writer.reset();
}
2022-12-06 17:27:05 +00:00
if (file)
{
file.reset();
parent->deltaAllocAndCheck(-stat.compressed_size, -stat.uncompressed_size);
}
if (segment_holder)
segment_holder.reset();
2022-12-06 17:27:05 +00:00
}
/// Returns the path of the backing storage: the absolute path of the regular file,
/// or the local-cache path of the first segment when the stream lives in the file cache.
/// Throws LOGICAL_ERROR when the stream holds neither.
String TemporaryFileStream::getPath() const
{
    if (file)
        return file->getAbsolutePath();

    const bool has_segment = segment_holder && !segment_holder->empty();
    if (has_segment)
        return segment_holder->front().getPathInLocalCache();

    throw Exception(ErrorCodes::LOGICAL_ERROR, "TemporaryFileStream has no file");
}
2022-09-01 12:22:49 +00:00
/// Releases all resources of the stream; exceptions are logged and never leave the destructor.
TemporaryFileStream::~TemporaryFileStream()
{
    try
    {
        release();
    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);

        /// deltaAllocAndCheck with a negative delta is not expected to throw.
        assert(false);
    }
}
}